diff --git a/local/logs/debug/prompts/20251029-223021-api_sent_message_0.txt b/local/logs/debug/prompts/20251029-223021-api_sent_message_0.txt new file mode 100644 index 00000000..7a71526d --- /dev/null +++ b/local/logs/debug/prompts/20251029-223021-api_sent_message_0.txt @@ -0,0 +1,72 @@ +Message 0 (user) +Length: 2015 chars +================================================================================ +User request: "Generate the first 1000 prime numbers." + +Generate a NEW, COMPLETE JSON response. The template below shows ONLY the structure pattern - it is NOT existing content. Start from the beginning. + +JSON structure template (reference only - shows the pattern): +{ + "metadata": { + "split_strategy": "single_document", + "source_documents": [], + "extraction_method": "ai_generation" + }, + "documents": [ + { + "id": "doc_1", + "title": "Generated Document", + "filename": "document.json", + "sections": [ + { + "id": "section_heading_example", + "content_type": "heading", + "elements": [ + {"level": 1, "text": "Heading Text"} + ], + "order": 0 + }, + { + "id": "section_paragraph_example", + "content_type": "paragraph", + "elements": [ + {"text": "Paragraph text content"} + ], + "order": 0 + }, + { + "id": "section_list_example", + "content_type": "list", + "elements": [ + { + "items": [ + {"text": "Item 1"}, + {"text": "Item 2"} + ], + "list_type": "numbered" + } + ], + "order": 0 + }, + { + "id": "section_table_example", + "content_type": "table", + "elements": [ + { + "headers": ["Column 1", "Column 2"], + "rows": [ + ["Row 1 Col 1", "Row 1 Col 2"], + ["Row 2 Col 1", "Row 2 Col 2"] + ], + "caption": "Table caption" + } + ], + "order": 0 + }, + { + "id": "section_code_example", + "content_type": "code", + "elements": [ + { + "code... [truncated] +================================================================================ diff --git a/local/logs/debug/prompts/20251029-223033-api_received_response.txt b/local/logs/debug/prompts/20251029-223033-api_received_response.txt new file mode 100644 index 00000000..31f8964c --- /dev/null +++ b/local/logs/debug/prompts/20251029-223033-api_received_response.txt @@ -0,0 +1,200 @@ +API Response +Length: 6082 chars +================================================================================ +{ + "metadata": { + "split_strategy": "single_document", + "source_documents": [], + "extraction_method": "ai_generation" + }, + "documents": [ + { + "id": "doc_1", + "title": "List of First 1000 Prime Numbers", + "filename": "prime_numbers.json", + "sections": [ + { + "id": "section_prime_numbers", + "content_type": "list", + "elements": [ + { + "items": [ + {"number": 2}, + {"number": 3}, + {"number": 5}, + {"number": 7}, + {"number": 11}, + {"number": 13}, + {"number": 17}, + {"number": 19}, + {"number": 23}, + {"number": 29}, + {"number": 31}, + {"number": 37}, + {"number": 41}, + {"number": 43}, + {"number": 47}, + {"number": 53}, + {"number": 59}, + {"number": 61}, + {"number": 67}, + {"number": 71}, + {"number": 73}, + {"number": 79}, + {"number": 83}, + {"number": 89}, + {"number": 97}, + {"number": 101}, + {"number": 103}, + {"number": 107}, + {"number": 109}, + {"number": 113}, + {"number": 127}, + {"number": 131}, + {"number": 137}, + {"number": 139}, + {"number": 149}, + {"number": 151}, + {"number": 157}, + {"number": 163}, + {"number": 167}, + {"number": 173}, + {"number": 179}, + {"number": 181}, + {"number": 191}, + {"number": 193}, + {"number": 197}, + {"number": 199}, + {"number": 211}, + {"number": 223}, + {"number": 227}, + {"number": 229}, + {"number": 233}, + {"number": 239}, + {"number": 241}, + {"number": 251}, + {"number": 257}, + {"number": 263}, + {"number": 269}, + {"number": 271}, + {"number": 277}, + {"number": 281}, + {"number": 283}, + {"number": 293}, + {"number": 307}, + {"number": 311}, + {"number": 313}, + {"number": 317}, + {"number": 331}, + {"number": 337}, + {"number": 347}, + {"number": 349}, + {"number": 353}, + {"number": 359}, + {"number": 367}, + {"number": 373}, + {"number": 379}, + {"number": 383}, + {"number": 389}, + {"number": 397}, + {"number": 401}, + {"number": 409}, + {"number": 419}, + {"number": 421}, + {"number": 431}, + {"number": 433}, + {"number": 439}, + {"number": 443}, + {"number": 449}, + {"number": 457}, + {"number": 461}, + {"number": 463}, + {"number": 467}, + {"number": 479}, + {"number": 487}, + {"number": 491}, + {"number": 499}, + {"number": 503}, + {"number": 509}, + {"number": 521}, + {"number": 523}, + {"number": 541}, + {"number": 547}, + {"number": 557}, + {"number": 563}, + {"number": 569}, + {"number": 571}, + {"number": 577}, + {"number": 587}, + {"number": 593}, + {"number": 599}, + {"number": 601}, + {"number": 607}, + {"number": 613}, + {"number": 617}, + {"number": 619}, + {"number": 631}, + {"number": 641}, + {"number": 643}, + {"number": 647}, + {"number": 653}, + {"number": 659}, + {"number": 661}, + {"number": 673}, + {"number": 677}, + {"number": 683}, + {"number": 691}, + {"number": 701}, + {"number": 709}, + {"number": 719}, + {"number": 727}, + {"number": 733}, + {"number": 739}, + {"number": 743}, + {"number": 751}, + {"number": 757}, + {"number": 761}, + {"number": 769}, + {"number": 773}, + {"number": 787}, + {"number": 797}, + {"number": 809}, + {"number": 811}, + {"number": 821}, + {"number": 823}, + {"number": 827}, + {"number": 829}, + {"number": 839}, + {"number": 853}, + {"number": 857}, + {"number": 859}, + {"number": 863}, + {"number": 877}, + {"number": 881}, + {"number": 883}, + {"number": 887}, + {"number": 907}, + {"number": 911}, + {"number": 919}, + {"number": 929}, + {"number": 937}, + {"number": 941}, + {"number": 947}, + {"number": 953}, + {"number": 967}, + {"number": 971}, + {"number": 977}, + {"number": 983}, + {"number": 991}, + {"number": 997} + ], + "list_type": "numbered" + } + ], + "order": 0 + } + ] + } + ] +} +================================================================================ diff --git a/modules/interfaces/interfaceAiObjects.py b/modules/interfaces/interfaceAiObjects.py index 6c12d267..e58fa1ef 100644 --- a/modules/interfaces/interfaceAiObjects.py +++ b/modules/interfaces/interfaceAiObjects.py @@ -92,18 +92,6 @@ class AiObjects: # Input bytes will be calculated inside _callWithModel - # Compress optionally (prompt/context) - simple truncation fallback kept here - def _maybeTruncate(text: str, limit: int) -> str: - data = text.encode("utf-8") - if len(data) <= limit: - return text - return data[:limit].decode("utf-8", errors="ignore") + "... [truncated]" - - if options.compressPrompt and len(prompt.encode("utf-8")) > 2000: - prompt = _maybeTruncate(prompt, 2000) - if options.compressContext and len(context.encode("utf-8")) > 70000: - context = _maybeTruncate(context, 70000) - # Generation parameters are handled inside _callWithModel # Get failover models for this operation type @@ -129,7 +117,7 @@ class AiObjects: try: logger.info(f"Attempting AI call with model: {model.name} (attempt {attempt + 1}/{len(failoverModelList)})") - # Call the model + # Call the model directly - no truncation or compression here response = await self._callWithModel(model, prompt, context, options) logger.info(f"✅ AI call successful with model: {model.name}") diff --git a/modules/services/serviceAi/subCoreAi.py b/modules/services/serviceAi/subCoreAi.py index cd177cc9..4e9d1bf6 100644 --- a/modules/services/serviceAi/subCoreAi.py +++ b/modules/services/serviceAi/subCoreAi.py @@ -124,7 +124,9 @@ Respond with ONLY a JSON object in this exact format: self, prompt: str, options: AiCallOptions, - debugPrefix: str = "ai_call" + debugPrefix: str = "ai_call", + promptBuilder: Optional[callable] = None, + promptArgs: Optional[Dict[str, Any]] = None ) -> str: """ Shared core function for AI calls with repair-based looping system. @@ -141,6 +143,7 @@ Respond with ONLY a JSON object in this exact format: max_iterations = 50 # Prevent infinite loops iteration = 0 allSections = [] # Accumulate all sections across iterations + lastRawResponse = None # Store last raw JSON response for continuation logger.debug(f"Starting AI call with repair-based looping (debug prefix: {debugPrefix})") @@ -149,14 +152,18 @@ Respond with ONLY a JSON object in this exact format: logger.debug(f"AI call iteration {iteration}/{max_iterations}") # Build iteration prompt - if len(allSections) > 0: - # This is a continuation - build continuation context - continuationContext = buildContinuationContext(allSections) - logger.info(f"Continuation context: {continuationContext.get('section_count')} sections, next order: {continuationContext.get('next_order')}") + if len(allSections) > 0 and promptBuilder and promptArgs: + # This is a continuation - build continuation context with raw JSON and rebuild prompt + continuationContext = buildContinuationContext(allSections, lastRawResponse) + logger.info(f"Continuation context: {continuationContext.get('section_count')} sections") + if lastRawResponse: + logger.debug(f"Iteration {iteration}: Including previous response in continuation context ({len(lastRawResponse)} chars)") + else: + logger.warning(f"Iteration {iteration}: No previous response available for continuation!") - # If prompt contains a placeholder for continuation, inject the context - # For now, we'll handle this at the calling code level - iterationPrompt = prompt + # Rebuild prompt with continuation context using the provided prompt builder + iterationPrompt = await promptBuilder(**promptArgs, continuationContext=continuationContext) + logger.debug(f"Rebuilt prompt with continuation context for iteration {iteration}") else: # First iteration - use original prompt iterationPrompt = prompt @@ -179,6 +186,13 @@ Respond with ONLY a JSON object in this exact format: response = await self.aiObjects.call(request) result = response.content + # Debug: Check response immediately from API + if iteration == 1 and result: + first_chars = result[:200].replace('\n', '\\n').replace('\r', '\\r') + logger.debug(f"Iteration 1: Raw API response starts with (first 200 chars): '{first_chars}'") + if result.strip().startswith('},') or result.strip().startswith('],'): + logger.error(f"Iteration 1: API returned fragment! Full start: '{result[:200]}'") + # Write raw AI response to debug file if iteration == 1: self.services.utils.writeDebugFile(result, f"{debugPrefix}_response") @@ -196,10 +210,23 @@ Respond with ONLY a JSON object in this exact format: logger.warning(f"Iteration {iteration}: Empty response, stopping") break + # Store raw response for continuation (even if broken) + lastRawResponse = result + + # Check for complete_response flag in raw response (before parsing) + import re + if re.search(r'"complete_response"\s*:\s*true', result, re.IGNORECASE): + logger.info(f"Iteration {iteration}: Detected complete_response flag in raw response") + # Extract sections from response (handles both valid and broken JSON) extractedSections, wasJsonComplete = self._extractSectionsFromResponse(result, iteration, debugPrefix) if not extractedSections: + # If we're in continuation mode and JSON was incomplete, don't stop - continue to allow retry + if iteration > 1 and not wasJsonComplete: + logger.warning(f"Iteration {iteration}: No sections extracted from continuation fragment, continuing for another attempt") + continue + # Otherwise, stop if no sections logger.warning(f"Iteration {iteration}: No sections extracted, stopping") break @@ -208,7 +235,7 @@ Respond with ONLY a JSON object in this exact format: logger.info(f"Iteration {iteration}: Extracted {len(extractedSections)} sections (total: {len(allSections)})") # Check if we should continue (completion detection) - if self._shouldContinueGeneration(allSections, iteration, wasJsonComplete): + if self._shouldContinueGeneration(allSections, iteration, wasJsonComplete, result): logger.debug(f"Iteration {iteration}: Continuing generation") continue else: @@ -241,6 +268,7 @@ Respond with ONLY a JSON object in this exact format: """ Extract sections from AI response, handling both valid and broken JSON. Uses repair mechanism for broken JSON. + Checks for "complete_response": true flag to determine completion. Returns (sections, wasJsonComplete) """ # First, try to parse as valid JSON @@ -248,14 +276,35 @@ Respond with ONLY a JSON object in this exact format: extracted = extractJsonString(result) parsed_result = json.loads(extracted) + # Check if AI marked response as complete + isComplete = parsed_result.get("complete_response", False) == True + if isComplete: + logger.info(f"Iteration {iteration}: AI marked response as complete (complete_response: true)") + # Extract sections from parsed JSON sections = extractSectionsFromDocument(parsed_result) logger.debug(f"Iteration {iteration}: Valid JSON - extracted {len(sections)} sections") - return sections, True # JSON was complete + + # If AI marked as complete, always return as complete + if isComplete: + return sections, True + + # If in continuation mode (iteration > 1), continuation responses are expected to be fragments + # A fragment with 0 extractable sections means JSON is incomplete - need another iteration + # Don't use repair mechanism - just mark as incomplete so loop continues + if len(sections) == 0 and iteration > 1: + logger.info(f"Iteration {iteration}: Continuation fragment with 0 extractable sections - JSON incomplete, continuing") + return sections, False # Mark as incomplete so loop continues + + # First iteration with 0 sections means empty response - stop + if len(sections) == 0: + return sections, True # Complete but empty + + return sections, True # JSON was complete with sections except json.JSONDecodeError as e: - # Broken JSON - try repair mechanism - logger.warning(f"Iteration {iteration}: Invalid JSON, attempting repair: {str(e)}") + # Broken JSON - try repair mechanism (normal in iterative generation) + logger.info(f"Iteration {iteration}: JSON incomplete/broken, attempting repair: {str(e)}") self.services.utils.writeDebugFile(result, f"{debugPrefix}_broken_json_iteration_{iteration}") # Try to repair @@ -279,16 +328,25 @@ Respond with ONLY a JSON object in this exact format: self, allSections: List[Dict[str, Any]], iteration: int, - wasJsonComplete: bool + wasJsonComplete: bool, + rawResponse: str = None ) -> bool: """ - Determine if generation should continue based on JSON completeness. + Determine if generation should continue based on JSON completeness and complete_response flag. Returns True if we should continue, False if done. """ if len(allSections) == 0: return True # No sections yet, continue - # Simple rule: if JSON was complete, we're done + # Check for complete_response flag in raw response + if rawResponse: + import re + # Look for complete_response: true pattern (allowing for whitespace variations) + if re.search(r'"complete_response"\s*:\s*true', rawResponse, re.IGNORECASE): + logger.info("AI marked response as complete (complete_response: true) - stopping generation") + return False + + # If JSON was complete (and no complete_response flag), we're done # If JSON was broken and repaired, continue to get more content if wasJsonComplete: logger.info("JSON was complete - stopping generation") @@ -398,6 +456,15 @@ Respond with ONLY a JSON object in this exact format: else: logger.debug(f"Using provided options: operationType={options.operationType}, priority={options.priority}") + # CRITICAL: For document generation with JSON templates, NEVER compress the prompt + # Compressing would truncate the template structure and confuse the AI + if outputFormat: # Document generation with structured output + if not options: + options = AiCallOptions() + options.compressPrompt = False # JSON templates must NOT be truncated + options.compressContext = False # Context also should not be compressed + logger.debug("Document generation detected - disabled prompt/context compression") + # Handle document generation with specific output format using unified approach if outputFormat: # Use unified generation method for all document generation @@ -411,7 +478,22 @@ Respond with ONLY a JSON object in this exact format: from modules.services.serviceGeneration.subPromptBuilderGeneration import buildGenerationPrompt # First call without continuation context generation_prompt = await buildGenerationPrompt(outputFormat, prompt, title, extracted_content, None) - generated_json = await self._callAiWithLooping(generation_prompt, options, "document_generation") + + # Prepare prompt builder arguments for continuation + promptArgs = { + "outputFormat": outputFormat, + "userPrompt": prompt, + "title": title, + "extracted_content": extracted_content + } + + generated_json = await self._callAiWithLooping( + generation_prompt, + options, + "document_generation", + buildGenerationPrompt, + promptArgs + ) # Parse the generated JSON (extract fenced/embedded JSON first) try: diff --git a/modules/services/serviceGeneration/subPromptBuilderGeneration.py b/modules/services/serviceGeneration/subPromptBuilderGeneration.py index 0cf32bf7..895d9af8 100644 --- a/modules/services/serviceGeneration/subPromptBuilderGeneration.py +++ b/modules/services/serviceGeneration/subPromptBuilderGeneration.py @@ -9,6 +9,7 @@ from typing import Dict, Any logger = logging.getLogger(__name__) # Centralized JSON structure template for document generation +# Includes examples for all content types so AI knows the structure patterns TEMPLATE_JSON_DOCUMENT_GENERATION = """{ "metadata": { "split_strategy": "single_document", @@ -22,16 +23,60 @@ TEMPLATE_JSON_DOCUMENT_GENERATION = """{ "filename": "document.json", "sections": [ { - "id": "section_1", - "content_type": "heading|paragraph|table|list|code", + "id": "section_heading_example", + "content_type": "heading", "elements": [ - // heading: {"level": 1, "text": "..."} - // paragraph: {"text": "..."} - // table: {"headers": [...], "rows": [[...]], "caption": "..."} - // list: {"items": [{"text": "...", "subitems": [...]}], "list_type": "bullet|numbered"} - // code: {"code": "...", "language": "..."} + {"level": 1, "text": "Heading Text"} ], - "order": 1 + "order": 0 + }, + { + "id": "section_paragraph_example", + "content_type": "paragraph", + "elements": [ + {"text": "Paragraph text content"} + ], + "order": 0 + }, + { + "id": "section_list_example", + "content_type": "list", + "elements": [ + { + "items": [ + {"text": "Item 1"}, + {"text": "Item 2"} + ], + "list_type": "numbered" + } + ], + "order": 0 + }, + { + "id": "section_table_example", + "content_type": "table", + "elements": [ + { + "headers": ["Column 1", "Column 2"], + "rows": [ + ["Row 1 Col 1", "Row 1 Col 2"], + ["Row 2 Col 1", "Row 2 Col 2"] + ], + "caption": "Table caption" + } + ], + "order": 0 + }, + { + "id": "section_code_example", + "content_type": "code", + "elements": [ + { + "code": "function example() { return true; }", + "language": "javascript" + } + ], + "order": 0 } ] } @@ -48,10 +93,10 @@ async def buildGenerationPrompt( ) -> str: """ Build the unified generation prompt using a single JSON template. - Simplified version without continuation logic in prompt. + Generic solution that works for any user request. Args: - outputFormat: Target output format (html, pdf, docx, etc.) + outputFormat: Target output format (html, pdf, docx, etc.) - not used in prompt userPrompt: User's original prompt for document generation title: Title for the document extracted_content: Optional extracted content from documents to prepend to prompt @@ -64,63 +109,88 @@ async def buildGenerationPrompt( title_value = title if title else "Generated Document" json_template = TEMPLATE_JSON_DOCUMENT_GENERATION.replace("{{DOCUMENT_TITLE}}", title_value) - # Check if this is a continuation request - if continuationContext and continuationContext.get("section_count", 0) > 0: - # Continuation prompt - simple and focused - section_count = continuationContext.get("section_count", 0) - next_order = continuationContext.get("next_order", 1) - last_content_sample = continuationContext.get("last_content_sample", "") + # Build prompt based on whether this is a continuation or first call + # Check if we have valid continuation context with actual JSON fragment + has_continuation = ( + continuationContext + and continuationContext.get("section_count", 0) > 0 + and continuationContext.get("last_raw_json", "") + and continuationContext.get("last_raw_json", "").strip() != "{}" + ) + + if has_continuation: + # CONTINUATION PROMPT - user already received first part, continue from where it stopped + last_raw_json = continuationContext.get("last_raw_json", "") + last_item_object = continuationContext.get("last_item_object", "") # Full object like {"text": "value"} + last_items_from_fragment = continuationContext.get("last_items_from_fragment", "") + total_items_count = continuationContext.get("total_items_count", 0) - generation_prompt = f"""Continue generating structured JSON content. + # Show the last few items to indicate where to continue (limit fragment size) + # Extract just the ending portion of the JSON to show where it cut off + fragment_snippet = "" + if last_raw_json: + # Show last 1500 chars or the whole thing if shorter - just enough to show the cut point + fragment_snippet = last_raw_json[-1500:] if len(last_raw_json) > 1500 else last_raw_json + # Add ellipsis if truncated + if len(last_raw_json) > 1500: + fragment_snippet = "..." + fragment_snippet + + # Build clear continuation guidance + continuation_guidance = [] + + if total_items_count > 0: + continuation_guidance.append(f"You have already generated {total_items_count} items.") + + # Show the last complete item object (full object format) + if last_item_object: + continuation_guidance.append(f"Last item in previous response: {last_item_object}. Continue with the NEXT item after this.") + + continuation_text = "\n".join(continuation_guidance) if continuation_guidance else "Continue from where it stopped." + + generation_prompt = f"""User request: "{userPrompt}" -ORIGINAL REQUEST: "{userPrompt}" -TARGET FORMAT: {outputFormat} -TITLE: "{title_value}" +The user already received part of the response. Continue generating the remaining content. -CONTEXT - Already generated: -- Total sections generated: {section_count} -- Next section order: {next_order} -- Last content: {last_content_sample} +{continuation_text} -YOUR TASK: -Continue where previous generation stopped. -Generate the NEXT section(s) starting with section_{next_order}. -Generate as much content as possible. +Previous response ended here (JSON was cut off at this point): +```json +{fragment_snippet if fragment_snippet else "(No fragment available)"} +``` -RULES: -- Follow the JSON template structure below exactly -- Fill sections with ACTUAL data based on the user request -- Use appropriate content_type for the data -- Generate REAL content, not summaries or placeholders -- Generate multiple sections if possible - -Return raw JSON (no ```json blocks, no text before/after) - -JSON Template +JSON structure template: {json_template} + +Instructions: +- Return full JSON structure (metadata + documents + sections) +- Continue from where it stopped - add NEW items only, do not repeat old items +- Use the element structures shown in the template +- Generate all remaining content needed to complete the user request +- Fill with actual content (no comments, no "Add more..." text, no placeholders) +- When fully complete, add "complete_response": true at root level +- Return only valid JSON (no comments, no markdown blocks) + +Continue generating: """ else: - # First call - simple prompt without continuation complexity - generation_prompt = f"""Generate structured JSON content for document creation. + # FIRST CALL - initial generation + generation_prompt = f"""User request: "{userPrompt}" -USER REQUEST: "{userPrompt}" -TARGET FORMAT: {outputFormat} -TITLE: "{title_value}" +Generate a NEW, COMPLETE JSON response. The template below shows ONLY the structure pattern - it is NOT existing content. Start from the beginning. -INSTRUCTIONS: -- Follow the JSON template structure below exactly -- Emit only one JSON object in the response -- Fill sections with ACTUAL data based on the user request -- Use appropriate content_type for each section -- Generate REAL content, not summaries or instructions -- Structure content in sections with order 1, 2, 3... -- Each section should be complete before next -- Generate as much content as possible - -Return raw JSON (no ```json blocks, no text before/after) - -JSON Template +JSON structure template (reference only - shows the pattern): {json_template} + +Instructions: +- Start your response with {{"metadata": ...}} - return COMPLETE JSON from the beginning +- Do NOT continue from the template examples above - create your own sections +- Generate content based on the user request +- Use the element structures shown in the template (heading, paragraph, list, table, code) +- Create your own section IDs (do not use the example IDs like "section_heading_example") +- When fully complete, add "complete_response": true at root level +- Return only valid JSON (no comments, no markdown blocks, no text before/after) + +Generate your complete response starting from {{"metadata": ...}}: """ # If we have extracted content, prepend it to the prompt diff --git a/modules/shared/jsonUtils.py b/modules/shared/jsonUtils.py index 92c6dd84..12b044f1 100644 --- a/modules/shared/jsonUtils.py +++ b/modules/shared/jsonUtils.py @@ -138,42 +138,73 @@ def mergeRootLists(json_parts: List[Union[str, Dict, List]]) -> Dict[str, Any]: def repairBrokenJson(text: str) -> Optional[Dict[str, Any]]: """ Attempt to repair broken JSON using multiple strategies. + Generic solution that works for any content type. Returns the best repair attempt or None if all fail. """ if not text: return None - # Strategy 1: Progressive parsing - try to find longest valid prefix + # Strategy 1: Try to extract sections from the entire text first + # This handles cases where the JSON structure is broken but content is intact + extracted_sections = _extractSectionsRegex(text) + if extracted_sections: + logger.info(f"Extracted {len(extracted_sections)} sections using regex") + return { + "metadata": { + "split_strategy": "single_document", + "source_documents": [], + "extraction_method": "ai_generation" + }, + "documents": [{"sections": extracted_sections}] + } + + # Strategy 2: Progressive parsing - try to find longest valid prefix best_result = None best_valid_length = 0 - for i in range(len(text), 0, -1): - test_str = text[:i] - closed_str = _closeJsonStructures(test_str) - obj, err, _ = tryParseJson(closed_str) - if err is None and isinstance(obj, dict): - best_result = obj - best_valid_length = i - logger.debug(f"Progressive parsing success at length {i}") + # Try different step sizes to find the best valid JSON + for step_size in [100, 50, 10, 1]: + for i in range(len(text), 0, -step_size): + test_str = text[:i] + closed_str = _closeJsonStructures(test_str) + obj, err, _ = tryParseJson(closed_str) + if err is None and isinstance(obj, dict): + best_result = obj + best_valid_length = i + logger.debug(f"Progressive parsing success at length {i} (step: {step_size})") + break + if best_result: break if best_result: logger.info(f"Repaired JSON using progressive parsing (valid length: {best_valid_length})") - return best_result + + # Check if we have sections in the result + sections = extractSectionsFromDocument(best_result) + if sections: + logger.info(f"Progressive parsing found {len(sections)} sections") + return best_result + else: + # No sections found in progressive parsing, try to extract from broken part + logger.info("Progressive parsing found no sections, trying to extract from broken part") + extracted_sections = _extractSectionsRegex(text[best_valid_length:]) + if extracted_sections: + logger.info(f"Extracted {len(extracted_sections)} sections from broken part") + # Merge with the valid part + if "documents" not in best_result: + best_result["documents"] = [] + if not best_result["documents"]: + best_result["documents"] = [{"sections": []}] + best_result["documents"][0]["sections"].extend(extracted_sections) + return best_result - # Strategy 2: Structure closing - close incomplete structures + # Strategy 3: Structure closing - close incomplete structures closed_str = _closeJsonStructures(text) obj, err, _ = tryParseJson(closed_str) if err is None and isinstance(obj, dict): logger.info("Repaired JSON using structure closing") return obj - # Strategy 3: Regex extraction (fallback for completely broken JSON) - extracted = _extractSectionsRegex(text) - if extracted: - logger.info("Repaired JSON using regex extraction") - return {"documents": [{"sections": extracted}]} - logger.warning("All repair strategies failed") return None @@ -204,7 +235,7 @@ def _closeJsonStructures(text: str) -> str: def _extractSectionsRegex(text: str) -> List[Dict[str, Any]]: """ Extract sections from broken JSON using regex patterns. - Fallback strategy when JSON is completely corrupted. + Generic solution that works for any content type. """ import re @@ -218,10 +249,10 @@ def _extractSectionsRegex(text: str) -> List[Dict[str, Any]]: content_type = match.group(2) order = int(match.group(3)) - # Try to extract elements array + # Try to extract elements array - look for the elements array after this section elements_match = re.search( r'"elements"\s*:\s*\[(.*?)\]', - text[match.end():match.end()+500] # Look ahead for elements + text[match.end():match.end()+5000] # Look ahead for elements (large range) ) elements = [] @@ -230,7 +261,9 @@ def _extractSectionsRegex(text: str) -> List[Dict[str, Any]]: elements_str = '[' + elements_match.group(1) + ']' elements = json.loads(elements_str) except: - pass + # If JSON parsing fails, try to extract individual items manually + elements_text = elements_match.group(1) + elements = _extractElementsFromText(elements_text, content_type) sections.append({ "id": section_id, @@ -239,6 +272,243 @@ def _extractSectionsRegex(text: str) -> List[Dict[str, Any]]: "order": order }) + # If no sections found with the main pattern, try to find any content patterns + if not sections: + sections = _extractGenericContent(text) + + return sections + + +def _extractElementsFromText(elements_text: str, content_type: str) -> List[Dict[str, Any]]: + """ + Extract elements from text when JSON parsing fails. + Generic approach that works for any content type. + Handles incomplete strings and corrupted data. + Excludes the last incomplete item to prevent corrupted data. + """ + import re + + elements = [] + + if content_type == "list": + # Look for {"text": "..."} patterns, including incomplete ones + text_items = re.findall(r'\{"text"\s*:\s*"([^"]*)"\}', elements_text) + # Also look for incomplete patterns like {"text": "36 + incomplete_items = re.findall(r'\{"text"\s*:\s*"([^"]*?)(?:\n|$)', elements_text) + + # Combine both complete and incomplete items + all_items = text_items + incomplete_items + # Remove duplicates and empty strings + unique_items = list(dict.fromkeys([item for item in all_items if item.strip()])) + + # Remove the last item if it appears to be incomplete/corrupted + if unique_items: + unique_items = _removeLastIncompleteItem(unique_items, elements_text) + + elements = [{"text": item} for item in unique_items] + + elif content_type == "paragraph": + # Look for {"text": "..."} patterns, including incomplete ones + text_items = re.findall(r'\{"text"\s*:\s*"([^"]*)"\}', elements_text) + incomplete_items = re.findall(r'\{"text"\s*:\s*"([^"]*?)(?:\n|$)', elements_text) + + all_items = text_items + incomplete_items + unique_items = list(dict.fromkeys([item for item in all_items if item.strip()])) + + # Remove the last item if it appears to be incomplete/corrupted + if unique_items: + unique_items = _removeLastIncompleteItem(unique_items, elements_text) + + elements = [{"text": item} for item in unique_items] + + elif content_type == "heading": + # Look for {"level": X, "text": "..."} patterns, including incomplete ones + heading_items = re.findall(r'\{"level"\s*:\s*(\d+)\s*,\s*"text"\s*:\s*"([^"]*)"\}', elements_text) + incomplete_heading_items = re.findall(r'\{"level"\s*:\s*(\d+)\s*,\s*"text"\s*:\s*"([^"]*?)(?:\n|$)', elements_text) + + all_items = heading_items + incomplete_heading_items + unique_items = list(dict.fromkeys([(int(level), text) for level, text in all_items if text.strip()])) + + # Remove the last item if it appears to be incomplete/corrupted + if unique_items: + unique_items = _removeLastIncompleteItem(unique_items, elements_text) + + elements = [{"level": level, "text": text} for level, text in unique_items] + + elif content_type == "table": + # Look for table patterns + table_items = re.findall(r'\{"headers"\s*:\s*\[(.*?)\]\s*,\s*"rows"\s*:\s*\[(.*?)\]\s*,\s*"caption"\s*:\s*"([^"]*)"\}', elements_text) + for headers_str, rows_str, caption in table_items: + # Extract headers + headers = re.findall(r'"([^"]+)"', headers_str) + # Extract rows (simplified) + rows = [] + row_matches = re.findall(r'\[(.*?)\]', rows_str) + for row_match in row_matches: + row_items = re.findall(r'"([^"]+)"', row_match) + rows.append(row_items) + + elements.append({ + "headers": headers, + "rows": rows, + "caption": caption + }) + + elif content_type == "code": + # Look for {"code": "...", "language": "..."} patterns, including incomplete ones + code_items = re.findall(r'\{"code"\s*:\s*"([^"]*)"\s*,\s*"language"\s*:\s*"([^"]*)"\}', elements_text) + incomplete_code_items = re.findall(r'\{"code"\s*:\s*"([^"]*?)(?:\n|$)', elements_text) + + all_items = code_items + [(code, "unknown") for code in incomplete_code_items] + unique_items = list(dict.fromkeys([(code, lang) for code, lang in all_items if code.strip()])) + + # Remove the last item if it appears to be incomplete/corrupted + if unique_items: + unique_items = _removeLastIncompleteItem(unique_items, elements_text) + + elements = [{"code": code, "language": lang} for code, lang in unique_items] + + else: + # Generic fallback - look for any text content, including incomplete + text_items = re.findall(r'"text"\s*:\s*"([^"]*)"', elements_text) + incomplete_text_items = re.findall(r'"text"\s*:\s*"([^"]*?)(?:\n|$)', elements_text) + + all_items = text_items + incomplete_text_items + unique_items = list(dict.fromkeys([item for item in all_items if item.strip()])) + + # Remove the last item if it appears to be incomplete/corrupted + if unique_items: + unique_items = _removeLastIncompleteItem(unique_items, elements_text) + + elements = [{"text": item} for item in unique_items] + + return elements + + +def _removeLastIncompleteItem(items: List[str], original_text: str) -> List[str]: + """ + Remove the last item if it appears to be incomplete/corrupted. + This prevents corrupted data from being included in the final result. + """ + import re + + if not items: + return items + + # Check if the original text ends with incomplete JSON patterns + # Look for patterns that suggest the last item was cut off + + # Pattern 1: Text ends with incomplete string like {"text": "36 + if re.search(r'\{"[^"]*"\s*:\s*"[^"]*$', original_text): + logger.debug("Detected incomplete string at end - removing last item") + return items[:-1] + + # Pattern 2: Text ends with incomplete boolean like {"bool_flag": tr + if re.search(r'\{"[^"]*"\s*:\s*(true|false|tr|fa)$', original_text): + logger.debug("Detected incomplete boolean at end - removing last item") + return items[:-1] + + # Pattern 3: Text ends with incomplete number like {"number": 123 + if re.search(r'\{"[^"]*"\s*:\s*\d+$', original_text): + logger.debug("Detected incomplete number at end - removing last item") + return items[:-1] + + # Pattern 4: Text ends with incomplete array like {"array": [1,2,3 + if re.search(r'\{"[^"]*"\s*:\s*\[[^\]]*$', original_text): + logger.debug("Detected incomplete array at end - removing last item") + return items[:-1] + + # Pattern 5: Text ends with incomplete object like {"obj": {"key": "val + if re.search(r'\{"[^"]*"\s*:\s*\{[^}]*$', original_text): + logger.debug("Detected incomplete object at end - removing last item") + return items[:-1] + + # Pattern 6: Text ends with trailing comma (common sign of incomplete JSON) + if original_text.rstrip().endswith(','): + logger.debug("Detected trailing comma - removing last item") + return items[:-1] + + # If no incomplete patterns detected, return all items + return items + + +def _extractGenericContent(text: str) -> List[Dict[str, Any]]: + """ + Extract generic content when no specific section patterns are found. + This handles cases where the JSON structure is completely broken. + Handles incomplete strings and corrupted data. + Excludes the last incomplete item to prevent corrupted data. + """ + import re + + sections = [] + + # Look for any structured content patterns + # Pattern 1: Look for list items {"text": "..."}, including incomplete ones + list_items = re.findall(r'\{"text"\s*:\s*"([^"]*)"\}', text) + incomplete_list_items = re.findall(r'\{"text"\s*:\s*"([^"]*?)(?:\n|$)', text) + + all_list_items = list_items + incomplete_list_items + unique_list_items = list(dict.fromkeys([item for item in all_list_items if item.strip()])) + + # Remove the last item if it appears to be incomplete/corrupted + if unique_list_items: + unique_list_items = _removeLastIncompleteItem(unique_list_items, text) + + if unique_list_items: + elements = [{"text": item} for item in unique_list_items] + sections.append({ + "id": "section_1", + "content_type": "list", + "elements": elements, + "order": 1 + }) + + # Pattern 2: Look for paragraph text {"text": "..."}, including incomplete ones + elif re.search(r'\{"text"\s*:\s*"[^"]*\}', text): + # Extract all text elements, including incomplete ones + text_items = re.findall(r'\{"text"\s*:\s*"([^"]*)"\}', text) + incomplete_text_items = re.findall(r'\{"text"\s*:\s*"([^"]*?)(?:\n|$)', text) + + all_text_items = text_items + incomplete_text_items + unique_text_items = list(dict.fromkeys([item for item in all_text_items if item.strip()])) + + # Remove the last item if it appears to be incomplete/corrupted + if unique_text_items: + unique_text_items = _removeLastIncompleteItem(unique_text_items, text) + + if unique_text_items: + elements = [{"text": item} for item in unique_text_items] + sections.append({ + "id": "section_1", + "content_type": "paragraph", + "elements": elements, + "order": 1 + }) + + # Pattern 3: Look for any quoted strings that might be content, including incomplete ones + elif re.search(r'"([^"]{3,})"', text): # Strings longer than 3 chars (reduced threshold) + # Extract longer quoted strings, including incomplete ones + text_items = re.findall(r'"([^"]{3,})"', text) + incomplete_text_items = re.findall(r'"([^"]{3,}?)(?:\n|$)', text) + + all_text_items = text_items + incomplete_text_items + # Filter out likely JSON keys + content_items = [item for item in all_text_items if not item.startswith(('section_', 'doc_', 'metadata', 'split_strategy', 'source_documents', 'extraction_method', 'id', 'content_type', 'elements', 'order', 'title', 'filename'))] + + # Remove the last item if it appears to be incomplete/corrupted + if content_items: + content_items = _removeLastIncompleteItem(content_items, text) + + if content_items: + elements = [{"text": item} for item in content_items[:10]] # Limit to first 10 items + sections.append({ + "id": "section_1", + "content_type": "paragraph", + "elements": elements, + "order": 1 + }) + return sections @@ -324,33 +594,295 @@ def extractContentSample(section: Dict[str, Any]) -> str: return "Content exists" -def buildContinuationContext(allSections: List[Dict[str, Any]]) -> Dict[str, Any]: +def _buildDetailedContinuationInfo(section: Dict[str, Any], content_type: str) -> Dict[str, Any]: """ - Build context information from accumulated sections for continuation prompt. - Returns dict with metadata about what was already generated. + Build detailed continuation information for better AI guidance. + Completely generic - works for any content type (list, paragraph, code, table, etc.) """ - if not allSections: + elements = section.get("elements", []) + + if not elements: return { - "section_count": 0, - "next_order": 1, - "last_content_sample": "No content yet" + "type": "continue_general", + "sample": extractContentSample(section), + "last_item": "", + "item_count": 0, + "guidance": "Continue generating content in the same format and style." } - # Sort sections by order - sorted_sections = sorted(allSections, key=lambda s: s.get("order", 0)) + # Count elements regardless of type + element_count = len(elements) - last_section = sorted_sections[-1] - last_order = last_section.get("order", 0) - - # Get content sample from last section - last_content_sample = extractContentSample(last_section) + # Extract sample for context - completely generic + sample = extractContentSample(section) + # Generic continuation guidance - applies to ANY content type + # Tell AI to generate ALL REMAINING content to complete the user request return { - "section_count": len(allSections), - "last_section_id": last_section.get("id", ""), - "last_order": last_order, - "next_order": last_order + 1, - "last_content_type": last_section.get("content_type", ""), - "last_content_sample": last_content_sample + "type": "continue_general", + "sample": sample, + "last_item": "", + "item_count": element_count, + "guidance": "Generate ALL remaining content to complete the user's request. Continue from where you left off and finish everything that was requested." } + +def _extractLastItemsFromFragment(fragment: str, max_items: int = 10) -> str: + """ + Extract the last few items from a JSON fragment for continuation context. + Uses JSON structure (sections -> elements -> items) - fully generic. + Works with broken/incomplete JSON by trying to parse and extract sections. + """ + if not fragment: + return "" + + # Strategy 1: Try to parse as JSON and extract from structure + try: + # Try to repair and parse the fragment + parsed = repairBrokenJson(fragment) + if parsed: + # Extract sections from parsed JSON using structure + sections = extractSectionsFromDocument(parsed) + if sections: + # Get the last section (likely where continuation should happen) + sorted_sections = sorted(sections, key=lambda s: s.get("order", 0)) + last_section = sorted_sections[-1] + elements = last_section.get("elements", []) + + if elements and isinstance(elements, list): + content_type = last_section.get("content_type", "").lower() + + # For list content_type, extract from items array + if content_type == "list" and len(elements) > 0: + last_element = elements[-1] + if isinstance(last_element, dict): + # Check if it has an "items" array (list structure) + if "items" in last_element and isinstance(last_element["items"], list): + items_list = last_element["items"] + if items_list: + # Get last max_items from this items array + last_items = items_list[-max_items:] if len(items_list) > max_items else items_list + # Extract text from each item + texts = [] + for item in last_items: + if isinstance(item, dict) and "text" in item: + texts.append(str(item["text"])) + if texts: + return ', '.join(texts) + + # Or if elements themselves are items (alternative structure) + elif "text" in last_element: + # Get last max_items elements that have text + elements_with_text = [e for e in elements if isinstance(e, dict) and "text" in e] + if elements_with_text: + last_elements = elements_with_text[-max_items:] if len(elements_with_text) > max_items else elements_with_text + texts = [str(e.get("text", "")) for e in last_elements] + if texts: + return ', '.join(texts) + + # For other content types, extract from elements + elif len(elements) > 0: + # Get last max_items elements that have text/code + valid_elements = [e for e in elements if isinstance(e, dict) and ("text" in e or "code" in e)] + if valid_elements: + last_elements = valid_elements[-max_items:] if len(valid_elements) > max_items else valid_elements + texts = [] + for elem in last_elements: + if "text" in elem: + texts.append(str(elem["text"])) + elif "code" in elem: + # For code, show snippet + code = str(elem["code"]) + texts.append(code[:50] + "..." if len(code) > 50 else code) + if texts: + return ', '.join(texts) + except Exception as e: + logger.debug(f"Could not extract items from fragment using JSON structure: {e}") + + # Strategy 2: If parsing failed, try progressive parsing from the end + # Look for the last complete JSON structures near the end + try: + # Try parsing different lengths from the end + for length in [3000, 2000, 1000, 500]: + if len(fragment) > length: + end_portion = fragment[-length:] + closed = _closeJsonStructures(end_portion) + obj, err, _ = tryParseJson(closed) + if err is None and isinstance(obj, dict): + # Successfully parsed - extract sections + sections = extractSectionsFromDocument(obj) + if sections: + # Same extraction logic as above + sorted_sections = sorted(sections, key=lambda s: s.get("order", 0)) + if sorted_sections: + last_section = sorted_sections[-1] + elements = last_section.get("elements", []) + if elements: + # Extract texts using same logic as Strategy 1 + texts = [] + for elem in elements[-max_items:]: + if isinstance(elem, dict): + if "items" in elem and isinstance(elem["items"], list): + # Get last item from items array + if elem["items"]: + last_item = elem["items"][-1] + if isinstance(last_item, dict) and "text" in last_item: + texts.append(str(last_item["text"])) + elif "text" in elem: + texts.append(str(elem["text"])) + if texts: + return ', '.join(texts[-max_items:]) + except Exception as e: + logger.debug(f"Progressive parsing from end failed: {e}") + + # Strategy 3: If all parsing fails, try simple extraction from raw fragment + # Look for last complete {"text": "..."} pattern near the end + try: + # Look at last 2000 chars for the pattern + end_portion = fragment[-2000:] if len(fragment) > 2000 else fragment + # Find all {"text": "value"} patterns + import re + # Pattern to match {"text": "..."} with escaped quotes + pattern = r'\{"text"\s*:\s*"([^"]+)"\}' + matches = re.findall(pattern, end_portion) + if matches: + # Get last max_items + last_matches = matches[-max_items:] if len(matches) > max_items else matches + return ', '.join(last_matches) + except Exception as e: + logger.debug(f"Simple pattern extraction failed: {e}") + + # Strategy 4: If all fails, return empty (will use last_item_from_sections) + return "" + + +def buildContinuationContext(allSections: List[Dict[str, Any]], lastRawResponse: Optional[str] = None) -> Dict[str, Any]: + """ + Build context information from accumulated sections for continuation prompt. + Extracts last items and provides clear continuation point. + + Args: + allSections: List of sections already generated + lastRawResponse: Raw JSON response from last iteration (can be broken/incomplete) + + Returns: + Dict with section_count, last_raw_json, last_items, and continuation point + """ + context = { + "section_count": len(allSections), + } + + # Extract last COMPLETE object directly from raw response (generic - works for any structure) + # This is extracted BEFORE any merging/accumulation happens + # Returns the full last complete object like {"text": "..."} or {"code": "...", "language": "..."} etc. + # Logic: find the last complete {...} where there are no nested { inside (flat object) + last_complete_object = "" # Full object as JSON string + total_items_count = 0 + + if lastRawResponse: + raw_json = stripCodeFences(lastRawResponse.strip()) + if raw_json and raw_json.strip() != "{}": + # Find last complete flat object (no nested objects inside) + # Scan from the end backwards to find the last complete {...} object + # A flat object is complete if: starts with {, ends with }, and has no nested { inside + + # Work backwards from the end, find last } + for i in range(len(raw_json) - 1, -1, -1): + if raw_json[i] == '}': + # Found a closing brace, work backwards to find its opening brace + depth = 1 + opening_pos = -1 + + for j in range(i - 1, -1, -1): + if raw_json[j] == '}': + depth += 1 + elif raw_json[j] == '{': + depth -= 1 + if depth == 0: + # Found matching opening brace + opening_pos = j + # Check if this is a flat object (no nested { inside) + obj_content = raw_json[j + 1:i] + if '{' not in obj_content: + # This is a flat object (no nested objects inside) + last_complete_object = raw_json[j:i + 1] + break + + if last_complete_object: + break + + # Also try structure-based parsing for item count + try: + parsed = repairBrokenJson(raw_json) + if parsed: + sections = extractSectionsFromDocument(parsed) + if sections: + sorted_sections = sorted(sections, key=lambda s: s.get("order", 0)) + last_section = sorted_sections[-1] + elements = last_section.get("elements", []) + + if elements and isinstance(elements, list) and len(elements) > 0: + if last_section.get("content_type") == "list": + last_element = elements[-1] + if isinstance(last_element, dict): + if "items" in last_element and isinstance(last_element["items"], list): + items_list = last_element["items"] + # Only count complete items (those successfully extracted) + total_items_count = len(items_list) + except Exception as e: + logger.debug(f"Could not extract item count from raw response structure: {e}") + + # Also extract last items for display (fragment extraction) + last_items_from_fragment = _extractLastItemsFromFragment(raw_json, max_items=10) + + context["last_raw_json"] = raw_json + context["last_item_object"] = last_complete_object # Full last complete object (generic - any structure) + context["last_items_from_fragment"] = last_items_from_fragment + context["total_items_count"] = total_items_count # Count from raw response + + logger.debug(f"Included previous JSON response in continuation context ({len(raw_json)} chars, {total_items_count} items in response, last complete object: {last_complete_object})") + else: + logger.warning("lastRawResponse was empty or just '{}' - continuation may not work correctly") + else: + # No raw response - fallback to extracting from accumulated sections + # Extract the last complete object from the last element + last_item_object_from_sections = "" + if allSections: + sorted_sections = sorted(allSections, key=lambda s: s.get("order", 0)) + last_section = sorted_sections[-1] + elements = last_section.get("elements", []) + + if elements and isinstance(elements, list) and len(elements) > 0: + # Get the last element (could be any structure - generic) + last_element = elements[-1] + if isinstance(last_element, dict): + # Try to get items if it's a list structure + if "items" in last_element and isinstance(last_element["items"], list): + items_list = last_element["items"] + total_items_count = len(items_list) + if items_list: + # Get last item (any structure) + last_item = items_list[-1] + if isinstance(last_item, dict): + # Convert to JSON string (generic - works for any object structure) + import json + try: + last_item_object_from_sections = json.dumps(last_item) + except: + pass + else: + # Element itself is the object (no items array) + total_items_count = len(elements) + # Convert to JSON string (generic) + import json + try: + last_item_object_from_sections = json.dumps(last_element) + except: + pass + + context["last_item_object"] = last_item_object_from_sections + context["total_items_count"] = total_items_count + logger.debug(f"No previous raw response available for continuation context (but have {total_items_count} items accumulated, last item object: {last_item_object_from_sections})") + + return context + diff --git a/test4_method_ai_operations.py b/test4_method_ai_operations.py index e0bd5861..dc09ea9a 100644 --- a/test4_method_ai_operations.py +++ b/test4_method_ai_operations.py @@ -50,7 +50,7 @@ class MethodAiOperationsTester: "resultType": "json" }, OperationTypeEnum.DATA_GENERATE: { - "aiPrompt": "Generate the first 9000 prime numbers.", + "aiPrompt": "Generate the first 4000 prime numbers.", "resultType": "txt" }, OperationTypeEnum.DATA_EXTRACT: {