fixed json looping context strings for cut point

This commit is contained in:
ValueOn AG 2026-01-05 01:30:27 +01:00
parent c20e65ad4e
commit 879a3c0eff
9 changed files with 2036 additions and 370 deletions

File diff suppressed because it is too large Load diff

View file

@ -262,7 +262,11 @@ class JsonMergeLogger:
JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(f" {line}")
JsonMergeLogger._log(" " + "="*76) JsonMergeLogger._log(" " + "="*76)
else: else:
JsonMergeLogger._log(f" Accumulated suffix (COMPLETE): {accSuffix}") # For lists/arrays, only log summary to avoid log flooding
if isinstance(accSuffix, list):
JsonMergeLogger._log(f" Accumulated suffix: list with {len(accSuffix)} items")
else:
JsonMergeLogger._log(f" Accumulated suffix: {type(accSuffix).__name__}")
if fragPrefix is not None: if fragPrefix is not None:
if isinstance(fragPrefix, str): if isinstance(fragPrefix, str):
prefixLines = fragPrefix.split('\n') prefixLines = fragPrefix.split('\n')
@ -278,7 +282,11 @@ class JsonMergeLogger:
for line in prefixLines: for line in prefixLines:
JsonMergeLogger._log(f" {line}") JsonMergeLogger._log(f" {line}")
else: else:
JsonMergeLogger._log(f" Fragment prefix (COMPLETE): {fragPrefix}") # For lists/arrays, only log summary to avoid log flooding
if isinstance(fragPrefix, list):
JsonMergeLogger._log(f" Fragment prefix: list with {len(fragPrefix)} items")
else:
JsonMergeLogger._log(f" Fragment prefix: {type(fragPrefix).__name__}")
else: else:
JsonMergeLogger._log(f" ⚠️ No overlap detected - appending all") JsonMergeLogger._log(f" ⚠️ No overlap detected - appending all")
@ -1903,13 +1911,32 @@ class ModularJsonMerger:
def _mergeStrings(accStr: str, fragStr: str, overlapLength: int) -> str: def _mergeStrings(accStr: str, fragStr: str, overlapLength: int) -> str:
""" """
Merge two JSON strings together, removing the overlap. Merge two JSON strings together, removing the overlap.
Handles whitespace at cut points properly for seamless merging.
""" """
if overlapLength > 0: if overlapLength > 0:
# Remove overlap from fragment and append # Remove overlap from fragment and append
# CRITICAL: Handle whitespace properly - if accumulated ends with whitespace
# and fragment starts with the same content, we need to preserve whitespace structure
merged = accStr + fragStr[overlapLength:] merged = accStr + fragStr[overlapLength:]
else: else:
# No overlap - just concatenate (might need comma or other separator) # No overlap - just concatenate (might need comma or other separator)
# Try to add comma if needed # CRITICAL: Preserve whitespace structure when merging
# Get trailing whitespace from accumulated (spaces, tabs, but not newlines)
accTrailingWs = ""
i = len(accStr) - 1
while i >= 0 and accStr[i] in [' ', '\t']:
accTrailingWs = accStr[i] + accTrailingWs
i -= 1
# Get leading whitespace from fragment (spaces, tabs, but not newlines)
fragLeadingWs = ""
i = 0
while i < len(fragStr) and fragStr[i] in [' ', '\t']:
fragLeadingWs += fragStr[i]
i += 1
# Trim for content detection but preserve whitespace structure
accTrimmed = accStr.rstrip().rstrip(',') accTrimmed = accStr.rstrip().rstrip(',')
fragTrimmed = fragStr.lstrip().lstrip(',') fragTrimmed = fragStr.lstrip().lstrip(',')
@ -1917,10 +1944,14 @@ class ModularJsonMerger:
if accTrimmed and fragTrimmed: if accTrimmed and fragTrimmed:
# If accumulated ends with } or ] and fragment starts with { or [, we might need comma # If accumulated ends with } or ] and fragment starts with { or [, we might need comma
if (accTrimmed[-1] in '}]' and fragTrimmed[0] in '{['): if (accTrimmed[-1] in '}]' and fragTrimmed[0] in '{['):
merged = accTrimmed + ',' + fragTrimmed # Add comma with appropriate whitespace
merged = accTrimmed + ',' + fragLeadingWs + fragTrimmed
else: else:
merged = accTrimmed + fragTrimmed # Merge with preserved whitespace structure
# Use the whitespace from fragment (it knows the proper spacing)
merged = accTrimmed + accTrailingWs + fragLeadingWs + fragTrimmed
else: else:
# One is empty - just concatenate with preserved whitespace
merged = accStr + fragStr merged = accStr + fragStr
return merged return merged

View file

@ -2198,16 +2198,66 @@ Output requirements:
incompletePart = continuationContext.incomplete_part incompletePart = continuationContext.incomplete_part
lastRawJson = continuationContext.last_raw_json lastRawJson = continuationContext.last_raw_json
# Build overlap context: extract last ~100 characters from the response for overlap # Build overlap context: extract cut part and full part before (same level) for overlap
overlapContext = "" overlapContext = ""
if lastRawJson: if lastRawJson:
overlapContext = lastRawJson[-100:].strip() # Find break position in raw JSON
lastCompletePart = continuationContext.last_complete_part
breakPos = len(lastRawJson.rstrip())
if lastCompletePart:
from modules.shared.jsonUtils import stripCodeFences, normalizeJsonText
normalizedRaw = stripCodeFences(normalizeJsonText(lastRawJson)).strip()
normalizedComplete = stripCodeFences(normalizeJsonText(lastCompletePart)).strip()
# Find where normalizedComplete ends in normalizedRaw
pos = normalizedRaw.find(normalizedComplete)
if pos >= 0:
breakPos = pos + len(normalizedComplete)
else:
pos = lastRawJson.find(lastCompletePart)
if pos >= 0:
breakPos = pos + len(lastCompletePart)
elif incompletePart:
pos = lastRawJson.find(incompletePart)
if pos >= 0:
breakPos = pos
# Extract cut part and full part before (same level)
overlapContext = self._extractOverlapContext(lastRawJson, breakPos)
# Build unified context showing structure hierarchy with cut point # Build unified context showing structure hierarchy with cut point
unifiedContext = "" unifiedContext = ""
if lastRawJson: if lastRawJson:
# Find break position in raw JSON # Find break position in raw JSON
if incompletePart: # Use last_complete_part length to find where complete part ends
lastCompletePart = continuationContext.last_complete_part
if lastCompletePart:
# Break position is where the complete part ends
# Normalize lastRawJson to match the normalized lastCompletePart
from modules.shared.jsonUtils import stripCodeFences, normalizeJsonText
normalizedRaw = stripCodeFences(normalizeJsonText(lastRawJson)).strip()
normalizedComplete = stripCodeFences(normalizeJsonText(lastCompletePart)).strip()
# Find where normalizedComplete ends in normalizedRaw
breakPos = normalizedRaw.find(normalizedComplete)
if breakPos >= 0:
breakPos = breakPos + len(normalizedComplete)
else:
# Fallback: use length of lastCompletePart in original string
breakPos = lastRawJson.find(lastCompletePart)
if breakPos >= 0:
breakPos = breakPos + len(lastCompletePart)
else:
# Last resort: use incompletePart position
if incompletePart:
breakPos = lastRawJson.find(incompletePart)
if breakPos == -1:
breakPos = len(lastRawJson.rstrip())
else:
breakPos = len(lastRawJson.rstrip())
elif incompletePart:
# If no complete part, find where incomplete part starts
breakPos = lastRawJson.find(incompletePart) breakPos = lastRawJson.find(incompletePart)
if breakPos == -1: if breakPos == -1:
breakPos = len(lastRawJson.rstrip()) breakPos = len(lastRawJson.rstrip())
@ -2215,8 +2265,8 @@ Output requirements:
breakPos = len(lastRawJson.rstrip()) breakPos = len(lastRawJson.rstrip())
# Build intelligent context showing hierarchy # Build intelligent context showing hierarchy
from modules.shared.jsonUtils import _buildIncompleteContext from modules.shared.jsonUtils import buildIncompleteContext
unifiedContext = _buildIncompleteContext(lastRawJson, breakPos) unifiedContext = buildIncompleteContext(lastRawJson, breakPos)
elif incompletePart: elif incompletePart:
unifiedContext = incompletePart unifiedContext = incompletePart
else: else:
@ -2229,29 +2279,43 @@ Output requirements:
The previous JSON response was incomplete. Continue from where it stopped. The previous JSON response was incomplete. Continue from where it stopped.
JSON Structure Template: JSON Structure Template:
```json
{templateStructure} {templateStructure}
```
Context showing structure hierarchy with cut point: Context showing structure hierarchy with cut point:
```
{unifiedContext} {unifiedContext}
```
Overlap Requirement: Overlap Requirement:
To ensure proper merging, your response MUST start by repeating approximately the last 100 characters from the previous response, then continue with new content. To ensure proper merging, your response MUST start by repeating the cut part and the full part before (same level) shown below, then continue with new content.
Last ~100 characters from previous response (repeat these at the start): Overlap context (cut part and full part before at same level):
```json
{overlapContext if overlapContext else "No overlap context available"} {overlapContext if overlapContext else "No overlap context available"}
```
TASK: TASK:
1. Start your response by repeating the last ~100 characters shown above (for overlap/merging) 1. Start your response by repeating the overlap context shown above (cut part and full part before at same level)
2. Complete the incomplete element shown in the context above (marked with CUT POINT) 2. Complete the incomplete element shown in the context above (marked with CUT POINT)
3. Continue generating the remaining content following the JSON structure template above 3. Continue generating the remaining content following the JSON structure template above
4. Return ONLY valid JSON following the structure template - no overlap/continuation wrapper objects 4. Return ONLY valid JSON following the structure template - no overlap/continuation wrapper objects
CRITICAL: CRITICAL:
- Your response must be valid JSON matching the structure template above - Your response must be valid JSON matching the structure template above
- Start with overlap (~100 chars) then continue seamlessly - Start with overlap context (cut part and full part before at same level) then continue seamlessly
- Complete the incomplete element and continue with remaining elements""" - Complete the incomplete element and continue with remaining elements"""
return continuationPrompt return continuationPrompt
def _extractOverlapContext(self, jsonContent: str, breakPosition: int) -> str:
    """
    Return the overlap context (cut part and full part before, same level).

    Thin wrapper: both arguments are forwarded unchanged to
    ``extractOverlapContext`` in ``modules.shared.jsonUtils`` and its result
    is returned as-is, so every caller shares one implementation.
    """
    # Imported at call time, as in the other delegating wrappers.
    from modules.shared.jsonUtils import extractOverlapContext as sharedExtractOverlapContext
    overlapContext = sharedExtractOverlapContext(jsonContent, breakPosition)
    return overlapContext
def _extractAndMergeMultipleJsonBlocks(self, responseText: str, contentType: str, sectionId: str) -> List[Dict[str, Any]]: def _extractAndMergeMultipleJsonBlocks(self, responseText: str, contentType: str, sectionId: str) -> List[Dict[str, Any]]:
""" """
Extract multiple JSON blocks from response and merge them appropriately. Extract multiple JSON blocks from response and merge them appropriately.

View file

@ -128,10 +128,18 @@ class StructureGenerator:
incompletePart = continuationContext.incomplete_part incompletePart = continuationContext.incomplete_part
lastRawJson = continuationContext.last_raw_json lastRawJson = continuationContext.last_raw_json
# Build overlap context: extract last ~100 characters from the response for overlap # Build overlap context: extract cut part and full part before (same level) for overlap
overlapContext = "" overlapContext = ""
if lastRawJson: if lastRawJson:
overlapContext = lastRawJson[-100:].strip() # Find break position
breakPos = len(lastRawJson.rstrip())
if incompletePart:
pos = lastRawJson.find(incompletePart)
if pos >= 0:
breakPos = pos
# Extract cut part and full part before (same level)
overlapContext = StructureGenerator._extractOverlapContext(lastRawJson, breakPos)
# Build unified context showing structure hierarchy with cut point # Build unified context showing structure hierarchy with cut point
unifiedContext = "" unifiedContext = ""
@ -145,8 +153,8 @@ class StructureGenerator:
breakPos = len(lastRawJson.rstrip()) breakPos = len(lastRawJson.rstrip())
# Build intelligent context showing hierarchy # Build intelligent context showing hierarchy
from modules.shared.jsonUtils import _buildIncompleteContext from modules.shared.jsonUtils import buildIncompleteContext
unifiedContext = _buildIncompleteContext(lastRawJson, breakPos) unifiedContext = buildIncompleteContext(lastRawJson, breakPos)
elif incompletePart: elif incompletePart:
unifiedContext = incompletePart unifiedContext = incompletePart
else: else:
@ -159,28 +167,172 @@ class StructureGenerator:
The previous JSON response was incomplete. Continue from where it stopped. The previous JSON response was incomplete. Continue from where it stopped.
JSON Structure Template: JSON Structure Template:
```json
{templateStructure} {templateStructure}
```
Context showing structure hierarchy with cut point: Context showing structure hierarchy with cut point:
```
{unifiedContext} {unifiedContext}
```
Overlap Requirement: Overlap Requirement:
To ensure proper merging, your response MUST start by repeating approximately the last 100 characters from the previous response, then continue with new content. To ensure proper merging, your response MUST start by repeating the cut part and the full part before (same level) shown below, then continue with new content.
Last ~100 characters from previous response (repeat these at the start): Overlap context (cut part and full part before at same level):
```json
{overlapContext if overlapContext else "No overlap context available"} {overlapContext if overlapContext else "No overlap context available"}
```
TASK: TASK:
1. Start your response by repeating the last ~100 characters shown above (for overlap/merging) 1. Start your response by repeating the overlap context shown above (cut part and full part before at same level)
2. Complete the incomplete element shown in the context above (marked with CUT POINT) 2. Complete the incomplete element shown in the context above (marked with CUT POINT)
3. Continue generating the remaining content following the JSON structure template above 3. Continue generating the remaining content following the JSON structure template above
4. Return ONLY valid JSON following the structure template - no overlap/continuation wrapper objects 4. Return ONLY valid JSON following the structure template - no overlap/continuation wrapper objects
CRITICAL: CRITICAL:
- Your response must be valid JSON matching the structure template above - Your response must be valid JSON matching the structure template above
- Start with overlap (~100 chars) then continue seamlessly - Start with overlap context (cut part and full part before at same level) then continue seamlessly
- Complete the incomplete element and continue with remaining elements""" - Complete the incomplete element and continue with remaining elements"""
return continuationPrompt return continuationPrompt
# NOTE(review): no `def` header is visible for this body. It reads `jsonContent`
# and `breakPosition` and returns a string on every path, which matches the
# signature of `_extractOverlapContext(jsonContent, breakPosition)` - confirm the
# header was not lost and that this code is reachable.
"""
Extract overlap context: cut part and full part before (same level).
Returns a string showing:
1. The last complete element at the same level before the cut point
2. The cut part (incomplete element at the cut point)
"""
# Guard: empty content or a non-positive break position -> last 200 characters (or "").
if not jsonContent or breakPosition <= 0:
return jsonContent[-200:].strip() if jsonContent else ""
from modules.shared.jsonUtils import findStructureHierarchy, extractCutPiece
# Find structure hierarchy
# presumably a sequence of enclosing containers; only the last entry is used below - TODO confirm
hierarchy = findStructureHierarchy(jsonContent, breakPosition)
if not hierarchy:
# Fallback: show last 200 chars before break
# (the slice also includes up to 100 characters after the break position)
start = max(0, breakPosition - 200)
return jsonContent[start:breakPosition + 100].strip()
# Get cut level (the array/object containing the cut piece)
cutLevel = hierarchy[-1]
cutLevelStart = cutLevel['start_pos']
cutLevelType = cutLevel['type']
# Extract cut piece (incomplete element)
cutPiece = extractCutPiece(jsonContent, breakPosition)
# Find the last complete element at the same level before the cut point
overlapParts = []
if cutLevelType == 'array':
# Find the last complete array element before breakPosition
i = breakPosition - 1
depth = 0
inString = False
escapeNext = False
elementStart = breakPosition
# Find the start of the incomplete element (or last complete element)
# The scan walks backwards from the break towards cutLevelStart.
# NOTE(review): walking backwards, a backslash is reached after the character it
# escapes, so `escapeNext` skips the character that precedes the backslash, not
# the escaped one - confirm this handles escaped quotes as intended.
while i >= cutLevelStart:
char = jsonContent[i]
if escapeNext:
escapeNext = False
i -= 1
continue
if char == '\\':
escapeNext = True
i -= 1
continue
if char == '"':
inString = not inString
i -= 1
continue
if not inString:
if char == ']':
depth += 1
elif char == '[':
depth -= 1
if depth < 0:
elementStart = i + 1
break
elif char == ',' and depth == 0:
elementStart = i + 1
break
i -= 1
# Extract the last complete element (if exists) and the cut part
if elementStart < breakPosition:
# NOTE(review): contentBeforeBreak is assigned here but not read again in this body.
contentBeforeBreak = jsonContent[max(cutLevelStart, elementStart - 500):breakPosition].strip()
# Find the last complete element by looking for balanced brackets/braces
lastCompleteEnd = breakPosition
braceCount = 0
bracketCount = 0
inString = False
escapeNext = False
# Go backwards from breakPosition to find where last complete element ends
# The range stop is exclusive, so the character at
# max(cutLevelStart, breakPosition - 1000) itself is not examined.
for j in range(breakPosition - 1, max(cutLevelStart, breakPosition - 1000), -1):
char = jsonContent[j]
if escapeNext:
escapeNext = False
continue
if char == '\\':
escapeNext = True
continue
if char == '"':
inString = not inString
continue
if not inString:
if char == '}':
braceCount += 1
elif char == '{':
braceCount -= 1
if braceCount == 0 and bracketCount == 0:
lastCompleteEnd = j
break
elif char == ']':
bracketCount += 1
elif char == '[':
bracketCount -= 1
if bracketCount == 0 and braceCount == 0:
lastCompleteEnd = j + 1
break
elif char == ',' and braceCount == 0 and bracketCount == 0:
lastCompleteEnd = j + 1
break
# Extract last complete element and cut part
# The "last complete element" slice is capped at 300 characters before lastCompleteEnd;
# the cut part runs from lastCompleteEnd to the end of the cut piece.
if lastCompleteEnd < breakPosition:
lastCompleteElement = jsonContent[max(cutLevelStart, lastCompleteEnd - 300):lastCompleteEnd].strip()
cutPart = jsonContent[lastCompleteEnd:breakPosition + len(cutPiece)].strip()
if lastCompleteElement:
overlapParts.append(f"Last complete element at same level:\n{lastCompleteElement}")
if cutPart:
overlapParts.append(f"Cut part (incomplete):\n{cutPart}")
else:
contextStart = max(cutLevelStart, breakPosition - 300)
overlapParts.append(jsonContent[contextStart:breakPosition + len(cutPiece)].strip())
else:
contextStart = max(cutLevelStart, breakPosition - 300)
overlapParts.append(jsonContent[contextStart:breakPosition + len(cutPiece)].strip())
else:
# For objects or other types, show context around break point
contextStart = max(cutLevelStart, breakPosition - 300)
overlapParts.append(jsonContent[contextStart:breakPosition + len(cutPiece)].strip())
# Join the collected parts with a blank line; if nothing was collected, fall back
# to a window of 200 characters before and 100 after the break position.
return "\n\n".join(overlapParts) if overlapParts else jsonContent[max(0, breakPosition - 200):breakPosition + 100].strip()
# Call AI with looping support # Call AI with looping support
# NOTE: Do NOT pass contentParts here - we only need metadata for structure generation # NOTE: Do NOT pass contentParts here - we only need metadata for structure generation
@ -304,6 +456,15 @@ CRITICAL:
logger.error(f"Error in generateStructure: {str(e)}") logger.error(f"Error in generateStructure: {str(e)}")
raise raise
@staticmethod
def _extractOverlapContext(jsonContent: str, breakPosition: int) -> str:
    """
    Return the overlap context (cut part and full part before, same level).

    Thin wrapper: both arguments are forwarded unchanged to
    ``extractOverlapContext`` in ``modules.shared.jsonUtils`` and its result
    is returned as-is, so every caller shares one implementation.
    """
    # Imported at call time, as in the other delegating wrappers.
    from modules.shared.jsonUtils import extractOverlapContext as sharedExtractOverlapContext
    overlapContext = sharedExtractOverlapContext(jsonContent, breakPosition)
    return overlapContext
def _buildChapterStructurePrompt( def _buildChapterStructurePrompt(
self, self,
userPrompt: str, userPrompt: str,

View file

@ -26,6 +26,15 @@ class CodeGenerationPath:
def __init__(self, services): def __init__(self, services):
self.services = services self.services = services
@staticmethod
def _extractOverlapContext(jsonContent: str, breakPosition: int) -> str:
    """
    Return the overlap context (cut part and full part before, same level).

    Thin wrapper: both arguments are forwarded unchanged to
    ``extractOverlapContext`` in ``modules.shared.jsonUtils`` and its result
    is returned as-is, so every caller shares one implementation.
    """
    # Imported at call time, as in the other delegating wrappers.
    from modules.shared.jsonUtils import extractOverlapContext as sharedExtractOverlapContext
    overlapContext = sharedExtractOverlapContext(jsonContent, breakPosition)
    return overlapContext
async def generateCode( async def generateCode(
self, self,
userPrompt: str, userPrompt: str,
@ -354,8 +363,8 @@ Return ONLY valid JSON matching the request above.
breakPos = len(lastRawJson.rstrip()) breakPos = len(lastRawJson.rstrip())
# Build intelligent context showing hierarchy # Build intelligent context showing hierarchy
from modules.shared.jsonUtils import _buildIncompleteContext from modules.shared.jsonUtils import buildIncompleteContext
unifiedContext = _buildIncompleteContext(lastRawJson, breakPos) unifiedContext = buildIncompleteContext(lastRawJson, breakPos)
elif incompletePart: elif incompletePart:
unifiedContext = incompletePart unifiedContext = incompletePart
else: else:
@ -368,26 +377,32 @@ Return ONLY valid JSON matching the request above.
The previous JSON response was incomplete. Continue from where it stopped. The previous JSON response was incomplete. Continue from where it stopped.
JSON Structure Template: JSON Structure Template:
```json
{templateStructure} {templateStructure}
```
Context showing structure hierarchy with cut point: Context showing structure hierarchy with cut point:
```
{unifiedContext} {unifiedContext}
```
Overlap Requirement: Overlap Requirement:
To ensure proper merging, your response MUST start by repeating approximately the last 100 characters from the previous response, then continue with new content. To ensure proper merging, your response MUST start by repeating the cut part and the full part before (same level) shown below, then continue with new content.
Last ~100 characters from previous response (repeat these at the start): Overlap context (cut part and full part before at same level):
```json
{overlapContext if overlapContext else "No overlap context available"} {overlapContext if overlapContext else "No overlap context available"}
```
TASK: TASK:
1. Start your response by repeating the last ~100 characters shown above (for overlap/merging) 1. Start your response by repeating the overlap context shown above (cut part and full part before at same level)
2. Complete the incomplete element shown in the context above (marked with CUT POINT) 2. Complete the incomplete element shown in the context above (marked with CUT POINT)
3. Continue generating the remaining content following the JSON structure template above 3. Continue generating the remaining content following the JSON structure template above
4. Return ONLY valid JSON following the structure template - no overlap/continuation wrapper objects 4. Return ONLY valid JSON following the structure template - no overlap/continuation wrapper objects
CRITICAL: CRITICAL:
- Your response must be valid JSON matching the structure template above - Your response must be valid JSON matching the structure template above
- Start with overlap (~100 chars) then continue seamlessly - Start with overlap context (cut part and full part before at same level) then continue seamlessly
- Complete the incomplete element and continue with remaining elements""" - Complete the incomplete element and continue with remaining elements"""
return continuationPrompt return continuationPrompt
@ -793,10 +808,18 @@ Return ONLY valid JSON in this format:
incompletePart = continuationContext.incomplete_part incompletePart = continuationContext.incomplete_part
lastRawJson = continuationContext.last_raw_json lastRawJson = continuationContext.last_raw_json
# Build overlap context: extract last ~100 characters from the response for overlap # Build overlap context: extract cut part and full part before (same level) for overlap
overlapContext = "" overlapContext = ""
if lastRawJson: if lastRawJson:
overlapContext = lastRawJson[-100:].strip() # Find break position
breakPos = len(lastRawJson.rstrip())
if incompletePart:
pos = lastRawJson.find(incompletePart)
if pos >= 0:
breakPos = pos
# Extract cut part and full part before (same level)
overlapContext = CodeGenerationPath._extractOverlapContext(lastRawJson, breakPos)
# Build unified context showing structure hierarchy with cut point # Build unified context showing structure hierarchy with cut point
unifiedContext = "" unifiedContext = ""
@ -810,8 +833,8 @@ Return ONLY valid JSON in this format:
breakPos = len(lastRawJson.rstrip()) breakPos = len(lastRawJson.rstrip())
# Build intelligent context showing hierarchy # Build intelligent context showing hierarchy
from modules.shared.jsonUtils import _buildIncompleteContext from modules.shared.jsonUtils import buildIncompleteContext
unifiedContext = _buildIncompleteContext(lastRawJson, breakPos) unifiedContext = buildIncompleteContext(lastRawJson, breakPos)
elif incompletePart: elif incompletePart:
unifiedContext = incompletePart unifiedContext = incompletePart
else: else:
@ -824,26 +847,32 @@ Return ONLY valid JSON in this format:
The previous JSON response was incomplete. Continue from where it stopped. The previous JSON response was incomplete. Continue from where it stopped.
JSON Structure Template: JSON Structure Template:
```json
{templateStructure} {templateStructure}
```
Context showing structure hierarchy with cut point: Context showing structure hierarchy with cut point:
```
{unifiedContext} {unifiedContext}
```
Overlap Requirement: Overlap Requirement:
To ensure proper merging, your response MUST start by repeating approximately the last 100 characters from the previous response, then continue with new content. To ensure proper merging, your response MUST start by repeating the cut part and the full part before (same level) shown below, then continue with new content.
Last ~100 characters from previous response (repeat these at the start): Overlap context (cut part and full part before at same level):
```json
{overlapContext if overlapContext else "No overlap context available"} {overlapContext if overlapContext else "No overlap context available"}
```
TASK: TASK:
1. Start your response by repeating the last ~100 characters shown above (for overlap/merging) 1. Start your response by repeating the overlap context shown above (cut part and full part before at same level)
2. Complete the incomplete element shown in the context above (marked with CUT POINT) 2. Complete the incomplete element shown in the context above (marked with CUT POINT)
3. Continue generating the remaining content following the JSON structure template above 3. Continue generating the remaining content following the JSON structure template above
4. Return ONLY valid JSON following the structure template - no overlap/continuation wrapper objects 4. Return ONLY valid JSON following the structure template - no overlap/continuation wrapper objects
CRITICAL: CRITICAL:
- Your response must be valid JSON matching the structure template above - Your response must be valid JSON matching the structure template above
- Start with overlap (~100 chars) then continue seamlessly - Start with overlap context (cut part and full part before at same level) then continue seamlessly
- Complete the incomplete element and continue with remaining elements""" - Complete the incomplete element and continue with remaining elements"""
return continuationPrompt return continuationPrompt

View file

@ -346,9 +346,18 @@ class BaseRenderer(ABC):
response = await aiService.callAi(request) response = await aiService.callAi(request)
# Save styling prompt and response to debug # Save styling prompt and response to debug (fire and forget - don't block on slow file I/O)
self.services.utils.writeDebugFile(styleTemplate, "renderer_styling_prompt") # The writeDebugFile calls os.listdir() which can be slow with many files
self.services.utils.writeDebugFile(response.content or '', "renderer_styling_response") # Run in background thread to avoid blocking rendering
import threading
def _writeDebugFiles():
    """
    Write the styling prompt and the AI response to debug files (best effort).

    Intended to run on a background thread so slow debug-file I/O does not
    hold up rendering. Reads ``self``, ``styleTemplate`` and ``response``
    from the enclosing scope. Never raises: any failure is reported at
    debug level and otherwise ignored.
    """
    import logging
    try:
        self.services.utils.writeDebugFile(styleTemplate, "renderer_styling_prompt")
        self.services.utils.writeDebugFile(response.content or '', "renderer_styling_response")
    except Exception as exc:
        # Debug output is optional - record why it was skipped instead of hiding it.
        logging.getLogger(__name__).debug("Skipped writing renderer styling debug files: %s", exc)
threading.Thread(target=_writeDebugFiles, daemon=True).start()
# Clean and parse JSON # Clean and parse JSON
result = response.content.strip() if response and response.content else "" result = response.content.strip() if response and response.content else ""

View file

@ -116,24 +116,37 @@ class RendererDocx(BaseRenderer):
async def _generateDocxFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str: async def _generateDocxFromJson(self, json_content: Dict[str, Any], title: str, userPrompt: str = None, aiService=None) -> str:
"""Generate DOCX content from structured JSON document.""" """Generate DOCX content from structured JSON document."""
import time
start_time = time.time()
try: try:
self.logger.debug("_generateDocxFromJson: Starting document generation")
# Create new document # Create new document
doc = Document() doc = Document()
self.logger.debug(f"_generateDocxFromJson: Document created in {time.time() - start_time:.2f}s")
# Get style set: use styles from metadata if available, otherwise enhance with AI # Get style set: use styles from metadata if available, otherwise enhance with AI
style_start = time.time()
self.logger.debug("_generateDocxFromJson: About to get style set")
styleSet = await self._getStyleSet(json_content, userPrompt, aiService) styleSet = await self._getStyleSet(json_content, userPrompt, aiService)
self.logger.debug(f"_generateDocxFromJson: Style set retrieved in {time.time() - style_start:.2f}s")
# Setup basic document styles and create all styles from style set # Setup basic document styles and create all styles from style set
setup_start = time.time()
self.logger.debug("_generateDocxFromJson: Setting up document styles")
self._setupBasicDocumentStyles(doc) self._setupBasicDocumentStyles(doc)
self._setupDocumentStyles(doc, styleSet) self._setupDocumentStyles(doc, styleSet)
self.logger.debug(f"_generateDocxFromJson: Document styles setup in {time.time() - setup_start:.2f}s")
# Validate JSON structure (standardized schema: {metadata: {...}, documents: [{sections: [...]}]}) # Validate JSON structure (standardized schema: {metadata: {...}, documents: [{sections: [...]}]})
if not self._validateJsonStructure(json_content): if not self._validateJsonStructure(json_content):
raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}") raise ValueError("JSON content must follow standardized schema: {metadata: {...}, documents: [{sections: [...]}]}")
# Extract sections and metadata from standardized schema # Extract sections and metadata from standardized schema
extract_start = time.time()
self.logger.debug("_generateDocxFromJson: Extracting sections and metadata")
sections = self._extractSections(json_content) sections = self._extractSections(json_content)
metadata = self._extractMetadata(json_content) metadata = self._extractMetadata(json_content)
self.logger.debug(f"_generateDocxFromJson: Extracted {len(sections)} sections in {time.time() - extract_start:.2f}s")
# Use provided title (which comes from documents[].title) as primary source # Use provided title (which comes from documents[].title) as primary source
# Fallback to metadata.title only if title parameter is empty # Fallback to metadata.title only if title parameter is empty
@ -144,18 +157,32 @@ class RendererDocx(BaseRenderer):
doc.add_paragraph(document_title, style='Title') doc.add_paragraph(document_title, style='Title')
# Process each section in order # Process each section in order
for section in sections: render_start = time.time()
self.logger.debug(f"_generateDocxFromJson: Starting to render {len(sections)} sections")
for idx, section in enumerate(sections):
section_start = time.time()
self.logger.debug(f"_generateDocxFromJson: Rendering section {idx + 1}/{len(sections)}")
self._renderJsonSection(doc, section, styleSet) self._renderJsonSection(doc, section, styleSet)
self.logger.debug(f"_generateDocxFromJson: Section {idx + 1} rendered in {time.time() - section_start:.2f}s")
self.logger.debug(f"_generateDocxFromJson: All sections rendered in {time.time() - render_start:.2f}s")
# Save to buffer # Save to buffer
save_start = time.time()
self.logger.debug("_generateDocxFromJson: Starting to save document to buffer")
buffer = io.BytesIO() buffer = io.BytesIO()
doc.save(buffer) doc.save(buffer)
buffer.seek(0) buffer.seek(0)
self.logger.debug(f"_generateDocxFromJson: Document saved to buffer in {time.time() - save_start:.2f}s")
# Convert to base64 # Convert to base64
encode_start = time.time()
self.logger.debug("_generateDocxFromJson: Converting to base64")
docx_bytes = buffer.getvalue() docx_bytes = buffer.getvalue()
docx_base64 = base64.b64encode(docx_bytes).decode('utf-8') docx_base64 = base64.b64encode(docx_bytes).decode('utf-8')
self.logger.debug(f"_generateDocxFromJson: Converted to base64 in {time.time() - encode_start:.2f}s (document size: {len(docx_bytes)} bytes)")
total_time = time.time() - start_time
self.logger.info(f"_generateDocxFromJson: Document generation completed in {total_time:.2f}s")
return docx_base64 return docx_base64
except Exception as e: except Exception as e:
@ -381,6 +408,8 @@ class RendererDocx(BaseRenderer):
def _renderJsonTable(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None: def _renderJsonTable(self, doc: Document, table_data: Dict[str, Any], styles: Dict[str, Any]) -> None:
"""Render a JSON table to DOCX using AI-generated styles.""" """Render a JSON table to DOCX using AI-generated styles."""
import time
table_start = time.time()
try: try:
# Extract from nested content structure # Extract from nested content structure
content = table_data.get("content", {}) content = table_data.get("content", {})
@ -392,19 +421,26 @@ class RendererDocx(BaseRenderer):
if not headers or not rows: if not headers or not rows:
return return
self.logger.debug(f"_renderJsonTable: Starting table render - {len(rows)} rows × {len(headers)} columns = {len(rows) * len(headers)} cells")
# Create table # Create table
create_start = time.time()
table = doc.add_table(rows=len(rows) + 1, cols=len(headers)) table = doc.add_table(rows=len(rows) + 1, cols=len(headers))
table.alignment = WD_TABLE_ALIGNMENT.CENTER table.alignment = WD_TABLE_ALIGNMENT.CENTER
self.logger.debug(f"_renderJsonTable: Table created in {time.time() - create_start:.2f}s")
# Apply table borders based on AI style # Apply table borders based on AI style
border_start = time.time()
border_style = styles["table_border"]["style"] border_style = styles["table_border"]["style"]
if border_style == "horizontal_only": if border_style == "horizontal_only":
self._applyHorizontalBordersOnly(table) self._applyHorizontalBordersOnly(table)
elif border_style == "grid": elif border_style == "grid":
table.style = 'Table Grid' table.style = 'Table Grid'
# else: no borders # else: no borders
self.logger.debug(f"_renderJsonTable: Borders applied in {time.time() - border_start:.2f}s")
# Add headers with AI-generated styling # Add headers with AI-generated styling
header_start = time.time()
header_row = table.rows[0] header_row = table.rows[0]
header_style = styles["table_header"] header_style = styles["table_header"]
for i, header in enumerate(headers): for i, header in enumerate(headers):
@ -424,9 +460,14 @@ class RendererDocx(BaseRenderer):
run.font.size = Pt(11) run.font.size = Pt(11)
text_color = header_style["text_color"].lstrip('#') text_color = header_style["text_color"].lstrip('#')
run.font.color.rgb = RGBColor(int(text_color[0:2], 16), int(text_color[2:4], 16), int(text_color[4:6], 16)) run.font.color.rgb = RGBColor(int(text_color[0:2], 16), int(text_color[2:4], 16), int(text_color[4:6], 16))
self.logger.debug(f"_renderJsonTable: Headers rendered in {time.time() - header_start:.2f}s")
# Add data rows with AI-generated styling # Add data rows with AI-generated styling
rows_start = time.time()
cell_style = styles["table_cell"] cell_style = styles["table_cell"]
total_cells = len(rows) * len(headers)
log_interval = max(1, total_cells // 20) # Log every 5% progress
for row_idx, row_data in enumerate(rows): for row_idx, row_data in enumerate(rows):
if row_idx + 1 < len(table.rows): if row_idx + 1 < len(table.rows):
table_row = table.rows[row_idx + 1] table_row = table.rows[row_idx + 1]
@ -435,16 +476,30 @@ class RendererDocx(BaseRenderer):
cell = table_row.cells[col_idx] cell = table_row.cells[col_idx]
cell.text = str(cell_data) cell.text = str(cell_data)
# Apply text styling # Apply text styling - OPTIMIZED: Only style if needed
for paragraph in cell.paragraphs: # For large tables, styling every cell can be very slow
paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT # Check if we need to apply styling (only if style differs from default)
for run in paragraph.runs: if cell_style.get("text_color") != "#2F2F2F" or cell_style.get("font_size") != 10:
run.font.size = Pt(10) for paragraph in cell.paragraphs:
text_color = cell_style["text_color"].lstrip('#') paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT
run.font.color.rgb = RGBColor(int(text_color[0:2], 16), int(text_color[2:4], 16), int(text_color[4:6], 16)) for run in paragraph.runs:
run.font.size = Pt(10)
text_color = cell_style["text_color"].lstrip('#')
run.font.color.rgb = RGBColor(int(text_color[0:2], 16), int(text_color[2:4], 16), int(text_color[4:6], 16))
# Log progress for large tables
if (row_idx + 1) % log_interval == 0 or row_idx == len(rows) - 1:
elapsed = time.time() - rows_start
progress = ((row_idx + 1) / len(rows)) * 100
cells_processed = (row_idx + 1) * len(headers)
rate = cells_processed / elapsed if elapsed > 0 else 0
self.logger.debug(f"_renderJsonTable: Progress {progress:.1f}% ({row_idx + 1}/{len(rows)} rows, {cells_processed}/{total_cells} cells) - Rate: {rate:.1f} cells/s, Elapsed: {elapsed:.2f}s")
total_time = time.time() - table_start
self.logger.info(f"_renderJsonTable: Table rendering completed in {total_time:.2f}s ({len(rows)} rows × {len(headers)} cols = {total_cells} cells)")
except Exception as e: except Exception as e:
self.logger.warning(f"Error rendering table: {str(e)}") self.logger.error(f"Error rendering table: {str(e)}", exc_info=True)
def _applyHorizontalBordersOnly(self, table) -> None: def _applyHorizontalBordersOnly(self, table) -> None:
"""Apply only horizontal borders to the table (no vertical borders).""" """Apply only horizontal borders to the table (no vertical borders)."""

View file

@ -1300,16 +1300,16 @@ def _extractLastCompleteAndIncomplete(jsonContent: str) -> Tuple[str, str]:
lastCompleteElement = _findLastCompleteElement(lastCompletePart) lastCompleteElement = _findLastCompleteElement(lastCompletePart)
if lastCompleteElement: if lastCompleteElement:
# Build context for incomplete part - show structure around the break # Build context for incomplete part - show structure around the break
incompleteWithContext = _buildIncompleteContext(jsonContent, lastCompleteEnd) incompleteWithContext = buildIncompleteContext(jsonContent, lastCompleteEnd)
return lastCompleteElement, incompleteWithContext return lastCompleteElement, incompleteWithContext
else: else:
# Build context for incomplete part # Build context for incomplete part
incompleteWithContext = _buildIncompleteContext(jsonContent, lastCompleteEnd) incompleteWithContext = buildIncompleteContext(jsonContent, lastCompleteEnd)
return lastCompletePart, incompleteWithContext return lastCompletePart, incompleteWithContext
else: else:
# No complete structure found - everything is incomplete # No complete structure found - everything is incomplete
# Still try to show context # Still try to show context
incompleteWithContext = _buildIncompleteContext(jsonContent, 0) incompleteWithContext = buildIncompleteContext(jsonContent, 0)
return "", incompleteWithContext return "", incompleteWithContext
@ -1359,183 +1359,449 @@ def _findLastCompleteElement(jsonStr: str) -> str:
return "" return ""
def buildIncompleteContext(jsonContent: str, breakPosition: int) -> str:
    """
    Build a hierarchical text context for JSON that was cut at breakPosition.

    The output walks the path returned by findStructureHierarchy from the
    root to the cut level and contains:
    - the opening bracket/brace of every level on the path (always shown)
    - for an array that is the direct parent of the cut level: the complete
      elements closest to the cut, limited to a budget of DATA_BUDGET
      characters, emitted in document order
    - for every other parent level: the raw text between its opening and the
      start of the next level
    - the cut piece, marked with "<-- CUT POINT (incomplete)"
    Open structures are intentionally NOT closed.

    Args:
        jsonContent: The (incomplete) JSON text.
        breakPosition: Index at which the JSON was cut.

    Returns:
        The context as a newline-joined string. If breakPosition is outside
        (0, len(jsonContent)] the input is returned unchanged; if no
        hierarchy can be determined, jsonContent[:breakPosition] is returned.
    """
    if breakPosition <= 0 or breakPosition > len(jsonContent):
        return jsonContent

    hierarchy = findStructureHierarchy(jsonContent, breakPosition)
    if not hierarchy:
        return jsonContent[:breakPosition]

    cutPiece = extractCutPiece(jsonContent, breakPosition)
    resultLines = []
    DATA_BUDGET = 500
    lastIndex = len(hierarchy) - 1

    for levelIndex, level in enumerate(hierarchy):
        levelType = level['type']
        levelStart = level['start_pos']
        levelDepth = level['depth']
        # NOTE(review): indent unit is a single space as seen in this view;
        # whitespace may have been collapsed - confirm against the repository.
        indent = " " * levelDepth
        childIndent = indent + " "
        isCutLevel = levelIndex == lastIndex
        isParentOfCutLevel = levelIndex == lastIndex - 1

        # The region belonging to this level ends where the next level starts
        # (or at the cut itself for the deepest level).
        if isCutLevel:
            nextLevelStart = breakPosition
        else:
            nextLevelStart = hierarchy[levelIndex + 1]['start_pos']

        # Opening structure for this level
        resultLines.append(f'{indent}{{' if levelType == 'object' else f'{indent}[')

        if isCutLevel:
            # Cut level: show the incomplete piece and mark its last line
            if cutPiece:
                for line in cutPiece.split('\n'):
                    stripped = line.strip()
                    if stripped:
                        resultLines.append(f'{childIndent}{stripped}')
                resultLines[-1] += ' <-- CUT POINT (incomplete)'
            else:
                resultLines.append(f'{childIndent}... <-- CUT POINT (incomplete)')

        elif isParentOfCutLevel and levelType == 'array':
            # Parent of cut level: pick complete elements nearest to the cut
            # while they fit into the budget ...
            completeElements = _findCompleteElementsAtLevel(
                jsonContent, levelStart, nextLevelStart, levelDepth
            )
            selected = []
            dataBudget = DATA_BUDGET
            for elementStart, elementEnd in reversed(completeElements):
                elementData = jsonContent[elementStart:elementEnd].strip()
                elementSize = len(elementData)
                if elementSize == 0:
                    continue
                if elementSize > dataBudget:
                    break
                selected.append((elementData, elementEnd))
                dataBudget -= elementSize
                if dataBudget <= 0:
                    break

            # ... then emit them in document order so the context reads like
            # the original JSON (selection above ran backwards from the cut).
            for elementData, elementEnd in reversed(selected):
                for line in elementData.split('\n'):
                    stripped = line.strip()
                    if stripped:
                        resultLines.append(f'{childIndent}{stripped}')
                if elementEnd < nextLevelStart:
                    resultLines[-1] += ','

        else:
            # Other parent levels: show the path content (keys and values)
            # leading to the next level; structure is never truncated.
            pathContent = jsonContent[levelStart + 1:nextLevelStart].strip()
            for line in pathContent.split('\n'):
                stripped = line.strip()
                if stripped:
                    resultLines.append(f'{childIndent}{stripped}')

    return "\n".join(resultLines)
def _extractCutPiece(jsonContent: str, breakPosition: int) -> str: def _buildNestedHierarchy(
"""Extract the incomplete piece at the cut point.""" resultLines: List[str],
# Get characters after break point (incomplete part) jsonContent: str,
afterBreak = jsonContent[breakPosition:breakPosition + 200].strip() hierarchy: List[Dict[str, Any]],
# Find where the incomplete piece ends (next comma, bracket, brace, or end) levelIndex: int,
for i, char in enumerate(afterBreak): breakPosition: int,
if char in [',', ']', '}', '\n']: cutPiece: str,
return afterBreak[:i].strip() cutLevel: Dict[str, Any]
return afterBreak[:50].strip() # Limit to 50 chars if no delimiter found ) -> None:
"""
Recursively build nested hierarchy from root to cut level.
This ensures proper nesting where each level contains the next level.
"""
if levelIndex >= len(hierarchy):
return
level = hierarchy[levelIndex]
levelType = level['type']
levelStart = level['start_pos']
levelKey = level.get('key')
levelDepth = level['depth']
indent = " " * levelDepth
isCutLevel = (levelIndex == len(hierarchy) - 1)
# Show opening structure for this level
if levelKey:
resultLines.append(f'{indent}"{levelKey}": {{' if levelType == 'object' else f'{indent}"{levelKey}": [')
else:
resultLines.append(f'{indent}{{' if levelType == 'object' else f'{indent}[')
childIndent = indent + " "
if isCutLevel:
# Cut level - show content (complete elements + cut piece)
if levelType == 'array':
charBudget = 1000
completeElements = _findCompleteElementsAtLevel(
jsonContent, levelStart, breakPosition, levelDepth
)
# Show complete elements (working backwards from the cut)
for elementStart, elementEnd in reversed(completeElements):
elementSize = elementEnd - elementStart
if charBudget >= elementSize:
element = jsonContent[elementStart:elementEnd].strip()
if element:
elementLines = element.split('\n')
for line in elementLines:
if line.strip():
resultLines.append(f'{childIndent}{line}')
if elementEnd < breakPosition:
resultLines[-1] += ','
charBudget -= elementSize
else:
break
# Show cut piece
if cutPiece:
cutPieceLines = cutPiece.split('\n')
for line in cutPieceLines:
if line.strip():
resultLines.append(f'{childIndent}{line}')
resultLines[-1] += ' <-- CUT POINT (incomplete)'
else:
cutPart = jsonContent[max(0, breakPosition-50):breakPosition]
resultLines.append(f'{childIndent}{cutPart} <-- CUT POINT (incomplete)')
else:
# Object at cut level
previewSize = breakPosition - levelStart
maxPreviewSize = 500
if previewSize > maxPreviewSize:
previewStart = breakPosition - maxPreviewSize
preview = jsonContent[previewStart:breakPosition]
else:
preview = jsonContent[levelStart:breakPosition]
previewLines = preview.split('\n')
for line in previewLines:
if line.strip():
resultLines.append(f'{childIndent}{line}')
cutPart = jsonContent[breakPosition:min(breakPosition + 50, len(jsonContent))]
resultLines.append(f'{childIndent}... {cutPart} <-- CUT POINT (incomplete)')
else:
# Parent level - show path to next level, then recursively build next level
nextLevel = hierarchy[levelIndex + 1]
nextLevelKey = nextLevel.get('key')
nextLevelStart = nextLevel['start_pos']
nextLevelType = nextLevel['type']
# Extract content between this level's opening and next level's start
# This shows any keys/values that come before the next level
pathContent = jsonContent[levelStart + 1:nextLevelStart].strip()
# Show the path content (keys/values before next level)
if len(pathContent) > 0 and len(pathContent) <= 500:
pathLines = pathContent.split('\n')
nonEmptyLines = [line for line in pathLines if line.strip()]
if nonEmptyLines:
for line in nonEmptyLines[:20]: # Show more lines
if line.strip():
resultLines.append(f'{childIndent}{line}')
if len(nonEmptyLines) > 20:
resultLines.append(f'{childIndent}... ({len(nonEmptyLines) - 20} more lines) ...')
elif len(pathContent) > 500:
# Content too large - show placeholder
resultLines.append(f'{childIndent}... (content too large, {len(pathContent)} chars) ...')
# Always show the key leading to next level if it exists
# The recursive call will show the opening bracket/brace, so we just show the key here
if nextLevelKey:
# Show the key (the recursive call will add the opening bracket/brace)
# Actually, the recursive call already shows the full opening with key,
# so we don't need to show it here - just let the recursive call handle it
pass
# Recursively build next level (this will show its opening structure and content)
_buildNestedHierarchy(resultLines, jsonContent, hierarchy, levelIndex + 1, breakPosition, cutPiece, cutLevel)
# Close this level
resultLines.append(f'{indent}}}' if levelType == 'object' else f'{indent}]')
def _findStructureHierarchy(jsonContent: str, breakPosition: int) -> List[Dict[str, Any]]: def _findCompleteElementsAtLevel(
jsonContent: str,
levelStart: int,
breakPosition: int,
targetDepth: int
) -> List[Tuple[int, int]]:
"""
Find all complete elements at a specific depth level.
Elements inside the structure at targetDepth are at targetDepth + 1.
We track depth relative to the start of the structure.
Returns list of (start, end) tuples for complete elements.
"""
completeElements = []
# Track depth relative to the level start
# When we're at levelStart, we're at the opening bracket/brace (depth = targetDepth)
# Elements inside are at depth = targetDepth + 1
relativeDepth = 0 # Depth relative to level start (0 = at opening bracket/brace)
inString = False
escapeNext = False
currentElementStart = None
# Find the first non-whitespace character after the opening bracket/brace
for i in range(levelStart + 1, min(breakPosition, len(jsonContent))):
if jsonContent[i] not in [' ', '\n', '\r', '\t']:
currentElementStart = i
break
if currentElementStart is None:
return completeElements
for i in range(currentElementStart, min(breakPosition, len(jsonContent))):
char = jsonContent[i]
if escapeNext:
escapeNext = False
continue
if char == '\\':
escapeNext = True
continue
if char == '"':
inString = not inString
continue
if not inString:
if char == '{':
relativeDepth += 1
elif char == '}':
relativeDepth -= 1
# Element is complete when we return to the level's depth (relativeDepth == 0)
if relativeDepth == 0:
# Found end of complete element
if currentElementStart is not None:
completeElements.append((currentElementStart, i + 1))
# Find start of next element
j = i + 1
while j < breakPosition and j < len(jsonContent) and jsonContent[j] in [' ', '\n', '\r', '\t', ',']:
j += 1
if j < breakPosition:
currentElementStart = j
else:
currentElementStart = None
elif char == '[':
relativeDepth += 1
elif char == ']':
relativeDepth -= 1
# Element is complete when we return to the level's depth (relativeDepth == 0)
if relativeDepth == 0:
# Found end of complete element
if currentElementStart is not None:
completeElements.append((currentElementStart, i + 1))
# Find start of next element
j = i + 1
while j < breakPosition and j < len(jsonContent) and jsonContent[j] in [' ', '\n', '\r', '\t', ',']:
j += 1
if j < breakPosition:
currentElementStart = j
else:
currentElementStart = None
elif char == ',':
# Comma at relativeDepth == 0 means we're between elements at the cut level
if relativeDepth == 0:
# Element boundary - check if we have a complete element
if currentElementStart is not None and currentElementStart < i:
# Simple value (string, number, boolean, null) - complete at comma
completeElements.append((currentElementStart, i))
# Find start of next element
j = i + 1
while j < breakPosition and j < len(jsonContent) and jsonContent[j] in [' ', '\n', '\r', '\t']:
j += 1
if j < breakPosition:
currentElementStart = j
else:
currentElementStart = None
return completeElements
def extractCutPiece(jsonContent: str, breakPosition: int) -> str:
    """
    Extract the incomplete piece at the cut point.

    Uses findStructureHierarchy to locate the cut level (the innermost
    structure on the returned path), then scans from that level's opening
    bracket/brace to breakPosition. Every comma or colon that is a direct
    separator of the cut level moves the element start forward, so the
    result is the text from the start of the last such element (or value)
    up to breakPosition. Separators inside strings or nested structures are
    ignored.

    Depth is tracked relative to the cut level's opening character: direct
    separators are at relative depth 1 regardless of how deeply the cut
    level itself is nested.

    Args:
        jsonContent: The (incomplete) JSON text.
        breakPosition: Index at which the JSON was cut.

    Returns:
        The incomplete element with leading whitespace removed and trailing
        whitespace preserved. "" if breakPosition is outside
        (0, len(jsonContent)]; up to the last 200 characters before the cut
        (left-stripped) if no hierarchy is found.
    """
    if breakPosition <= 0 or breakPosition > len(jsonContent):
        return ""

    hierarchy = findStructureHierarchy(jsonContent, breakPosition)
    if not hierarchy:
        # Fallback: return content before break
        return jsonContent[max(0, breakPosition - 200):breakPosition].lstrip()

    cutLevelStart = hierarchy[-1]['start_pos']
    whitespace = ' \n\r\t'

    relativeDepth = 0  # becomes 1 right after the cut level's opening char
    inString = False
    escapeNext = False
    currentElementStart = cutLevelStart

    for i in range(cutLevelStart, breakPosition):
        char = jsonContent[i]

        if escapeNext:
            escapeNext = False
            continue
        if char == '\\':
            escapeNext = True
            continue
        if char == '"':
            inString = not inString
            continue
        if inString:
            continue

        if char in '{[':
            relativeDepth += 1
        elif char in '}]':
            relativeDepth -= 1
        elif char in ',:' and relativeDepth == 1:
            # Direct separator of the cut level: the next element (after a
            # comma) or value (after a colon) starts behind it. If only
            # whitespace remains before the cut, keep the previous start.
            j = i + 1
            while j < breakPosition and jsonContent[j] in whitespace:
                j += 1
            if j < breakPosition:
                currentElementStart = j

    # Drop leading whitespace but preserve trailing whitespace - it is
    # needed for merging. An all-whitespace piece is returned unchanged.
    rawPiece = jsonContent[currentElementStart:breakPosition]
    cutPiece = rawPiece.lstrip()
    return cutPiece if cutPiece else rawPiece
def findStructureHierarchy(jsonContent: str, breakPosition: int) -> List[Dict[str, Any]]:
""" """
Find the structure hierarchy backwards from break point to root. Find the structure hierarchy backwards from break point to root.
Returns list of level info dicts, from root to cut level. Returns list of level info dicts, from root to cut level.
Each level has: type, start_pos, end_pos, parent_start, content_preview Each level has: type, start_pos, end_pos, depth, key
CRITICAL: Returns the path from root to cut point.
- For closed structures: uses actual end position
- For open structures: uses breakPosition
""" """
hierarchy = [] hierarchy = []
@ -1545,8 +1811,11 @@ def _findStructureHierarchy(jsonContent: str, breakPosition: int) -> List[Dict[s
inString = False inString = False
escapeNext = False escapeNext = False
# Find all structure boundaries before break point # Track ALL structures (both closed and open) to get correct end positions
structureStack = [] # Stack of (type, start_pos, depth) # Stack of (type, start_pos, depth, end_pos)
# end_pos is None until structure is closed
structureStack = [] # Stack of (type, start_pos, depth, end_pos)
closedStructures = [] # List of closed structures with their end positions
for i in range(breakPosition): for i in range(breakPosition):
if i >= len(jsonContent): if i >= len(jsonContent):
@ -1568,52 +1837,179 @@ def _findStructureHierarchy(jsonContent: str, breakPosition: int) -> List[Dict[s
if not inString: if not inString:
if char == '{': if char == '{':
structureStack.append(('object', i, braceDepth + bracketDepth)) # Store depth BEFORE incrementing (this is the level of the structure being opened)
currentDepth = braceDepth + bracketDepth
structureStack.append(('object', i, currentDepth, None))
braceDepth += 1 braceDepth += 1
elif char == '}': elif char == '}':
# When closing, record the end position and move to closed structures
if structureStack and structureStack[-1][0] == 'object': if structureStack and structureStack[-1][0] == 'object':
_, start, depth = structureStack.pop() structType, start, depth, _ = structureStack.pop()
hierarchy.append({ closedStructures.append({
'type': 'object', 'type': structType,
'start_pos': start, 'start_pos': start,
'end_pos': i + 1, 'end_pos': i + 1, # Actual end position
'depth': depth, 'depth': depth,
'key': _findKeyBefore(jsonContent, start) 'key': findKeyBefore(jsonContent, start)
}) })
braceDepth -= 1 braceDepth -= 1
elif char == '[': elif char == '[':
structureStack.append(('array', i, braceDepth + bracketDepth)) # Store depth BEFORE incrementing
currentDepth = braceDepth + bracketDepth
structureStack.append(('array', i, currentDepth, None))
bracketDepth += 1 bracketDepth += 1
elif char == ']': elif char == ']':
# When closing, record the end position
if structureStack and structureStack[-1][0] == 'array': if structureStack and structureStack[-1][0] == 'array':
_, start, depth = structureStack.pop() structType, start, depth, _ = structureStack.pop()
hierarchy.append({ closedStructures.append({
'type': 'array', 'type': structType,
'start_pos': start, 'start_pos': start,
'end_pos': i + 1, 'end_pos': i + 1, # Actual end position
'depth': depth, 'depth': depth,
'key': _findKeyBefore(jsonContent, start) 'key': findKeyBefore(jsonContent, start)
}) })
bracketDepth -= 1 bracketDepth -= 1
# Sort by depth (root first) and filter to get hierarchy from root to cut # Build hierarchy: we need the actual path from root to cut level
hierarchy.sort(key=lambda x: x['depth']) # CRITICAL: Only include structures that are actually on the path
# A structure is on the path if it contains the next level's start position
# Find which level contains the break point if not structureStack:
cutLevelIndex = -1 # No open structures - all were closed before break
for i, level in enumerate(hierarchy): # Return path to deepest closed structure
if level['start_pos'] < breakPosition <= level['end_pos']: if closedStructures:
cutLevelIndex = i maxDepth = max(s['depth'] for s in closedStructures)
# Build path: each level must contain the next level
path = []
for depth in range(maxDepth + 1):
candidates = [s for s in closedStructures if s['depth'] == depth]
if candidates:
# If multiple at same depth, use the one that contains structures at deeper depths
if depth < maxDepth:
# Find the one that contains a structure at depth + 1
nextDepthCandidates = [s for s in closedStructures if s['depth'] == depth + 1]
if nextDepthCandidates:
nextStart = min(s['start_pos'] for s in nextDepthCandidates)
# Find candidate that contains nextStart
for candidate in candidates:
if candidate['start_pos'] < nextStart < candidate['end_pos']:
path.append(candidate)
break
else:
# Fallback: use first candidate
path.append(candidates[0])
else:
path.append(candidates[0])
else:
path.append(candidates[0])
return path
return []
# We have open structures - build path from root to deepest open structure
# Strategy: Start from deepest open structure and work backwards to root,
# ensuring each level contains the next level
openByDepth = {}
for structType, start, depth, _ in structureStack:
openByDepth[depth] = {
'type': structType,
'start_pos': start,
'end_pos': breakPosition,
'depth': depth,
'key': findKeyBefore(jsonContent, start)
}
maxOpenDepth = max(openByDepth.keys())
# Build path backwards from deepest to root
path = []
currentDepth = maxOpenDepth
currentStart = openByDepth[maxOpenDepth]['start_pos']
while currentDepth >= 0:
# Look for structure at currentDepth that contains currentStart
# First check open structures
if currentDepth in openByDepth:
struct = openByDepth[currentDepth]
if struct['start_pos'] <= currentStart:
path.insert(0, struct)
currentStart = struct['start_pos']
currentDepth -= 1
continue
# Check closed structures
candidates = [s for s in closedStructures if s['depth'] == currentDepth and s['start_pos'] <= currentStart < s['end_pos']]
if candidates:
# Use the one that ends latest (most recent)
struct = max(candidates, key=lambda x: x['end_pos'])
path.insert(0, struct)
currentStart = struct['start_pos']
currentDepth -= 1
else:
# No structure found at this depth - break
break break
if cutLevelIndex >= 0: return path
# Return hierarchy from root to cut level
return hierarchy[:cutLevelIndex + 1] # Return the hierarchy (path from root to cut level)
if hierarchy:
return hierarchy
# Fallback: if JSON starts with { or [, create a root level
if jsonContent and jsonContent.strip():
firstChar = jsonContent.strip()[0]
if firstChar == '{':
return [{
'type': 'object',
'start_pos': 0,
'end_pos': breakPosition,
'depth': 0,
'key': None
}]
elif firstChar == '[':
return [{
'type': 'array',
'start_pos': 0,
'end_pos': breakPosition,
'depth': 0,
'key': None
}]
return [] return []
def extractOverlapContext(jsonContent: str, breakPosition: int) -> str:
    """
    Extract overlap context: the object containing the cut element.

    Returns ONLY the piece reported by extractCutPiece for the break position
    (the incomplete element itself). This is what the continuation should
    start with for proper merging.

    CRITICAL: Trailing whitespace is preserved on every path, because the
    whitespace right before the cut is part of what gets matched on merge.
    Only leading whitespace is trimmed from the fallback windows.

    Args:
        jsonContent: The incomplete JSON string
        breakPosition: Position where JSON was cut

    Returns:
        The cut piece if one is found; otherwise up to 200 characters of
        content preceding the break (or the last 200 characters of the
        content when breakPosition is not positive); "" for empty content.
    """
    if not jsonContent:
        return ""

    if breakPosition <= 0:
        # No usable cut position: fall back to the tail of the content.
        # lstrip (not strip) so the whitespace at the cut survives, consistent
        # with the final fallback below.
        return jsonContent[-200:].lstrip()

    # Extract cut piece (incomplete element) - the object containing the cut element
    cutPiece = extractCutPiece(jsonContent, breakPosition)
    if cutPiece:
        return cutPiece

    # Fallback: show content immediately before the break
    return jsonContent[max(0, breakPosition - 200):breakPosition].lstrip()
def findKeyBefore(jsonContent: str, pos: int) -> Optional[str]:
"""Find the key name before a structure start position.""" """Find the key name before a structure start position."""
# Look backwards for "key": pattern # Look backwards for "key": pattern
before = jsonContent[max(0, pos - 100):pos] before = jsonContent[max(0, pos - 100):pos]
@ -1832,10 +2228,13 @@ def _extractLastCompleteArrayElementsWithContext(
break break
if formattedElements: if formattedElements:
# Format as JSON array rows # Format as JSON array rows (without hardcoded indentation - caller will add it)
result = [] result = []
for elem in formattedElements: for elem in formattedElements:
result.append(f" {elem},") # Remove leading comma if present (from mid-element extraction)
cleanElem = elem.lstrip(',').strip()
if cleanElem:
result.append(f"{cleanElem},")
return "\n".join(result) return "\n".join(result)
return "" return ""

View file

@ -0,0 +1,216 @@
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
Test function to verify structure hierarchy and overlap context generation.
Tests the functions used to generate continuation prompts for incomplete JSON.
"""
import json
import os
from pathlib import Path
def testOverlapContext():
    """
    Exercise the continuation-prompt helpers on one captured, cut-off JSON response.

    Loads a single incomplete JSON response from the local debug folder,
    determines the break position (right after the last number in the file)
    and prints/collects the output of:
    1. findStructureHierarchy
    2. extractCutPiece
    3. buildIncompleteContext
    4. CodeGenerationPath._extractOverlapContext

    Returns:
        dict with the keys "hierarchy", "cutPiece", "incompleteContext",
        "overlapContext", "breakPosition", "json1Length" and "json1Content".

    Raises:
        FileNotFoundError: if the captured response file is not present.
    """
    # Load the JSON file (incomplete/cut JSON)
    basePath = Path(__file__).parent.parent.parent / "local" / "debug" / "prompts"
    file1Path = basePath / "20260104-220716-032-chapter_2_section_section_2_response.txt"

    # Read JSON (incomplete)
    with open(file1Path, 'r', encoding='utf-8') as f:
        json1Content = f.read().strip()

    # Find the break position in json1 (where it was cut)
    # The last line in json1 is incomplete: [37963, 37967, 37987, 37991, 37993, 37997, 38011, 38039
    # The content actually ends right after the last number, which is where
    # the closing ] should have been.
    import re
    matches = list(re.finditer(r'\d+', json1Content))
    if matches:
        # Break position is right after the last number
        breakPosition = matches[-1].end()
    else:
        # Fallback: use end of file (content was already stripped on read)
        breakPosition = len(json1Content)

    print(f"Break position determined: {breakPosition}")
    print(f"Content at break position: '{json1Content[max(0, breakPosition-50):breakPosition+10]}'")

    # Import the functions we need to test (done late, after extending sys.path)
    import sys
    sys.path.insert(0, str(Path(__file__).parent.parent))
    from modules.shared.jsonUtils import findStructureHierarchy, extractCutPiece, buildIncompleteContext
    from modules.services.serviceGeneration.paths.codePath import CodeGenerationPath

    # Test 1: Find structure hierarchy
    print("=" * 80)
    print("TEST 1: Structure Hierarchy")
    print("=" * 80)
    print(f"Break position: {breakPosition}")
    print(f"JSON length: {len(json1Content)}")
    print(f"Content around break: '{json1Content[max(0, breakPosition-100):breakPosition+20]}'")

    hierarchy = findStructureHierarchy(json1Content, breakPosition)
    print(f"\nHierarchy levels found: {len(hierarchy) if hierarchy else 0}")
    if not hierarchy:
        print("WARNING: No hierarchy found! This suggests the function isn't working correctly.")
    else:
        print("\nHierarchy details (from root to cut level):")
        for i, level in enumerate(hierarchy):
            levelType = level['type']
            levelKey = level.get('key', 'N/A')
            levelDepth = level['depth']
            levelStart = level['start_pos']
            levelEnd = level['end_pos']
            print(f" Level {i}: {levelType:6s} depth={levelDepth} key='{levelKey}' start={levelStart} end={levelEnd}")
            # Show a snippet of content at this level
            if levelStart < len(json1Content):
                snippet = json1Content[levelStart:min(levelStart + 50, levelEnd, len(json1Content))]
                print(f" Content: {repr(snippet)}")

    # Test 2: Extract cut piece
    print("\n" + "=" * 80)
    print("TEST 2: Extract Cut Piece")
    print("=" * 80)

    cutPiece = extractCutPiece(json1Content, breakPosition)
    print(f"\nCut piece extracted (length: {len(cutPiece)}):")
    if cutPiece:
        # Slicing already caps the output at 500 characters
        print(cutPiece[:500])
    else:
        print("WARNING: Cut piece is empty! This suggests the function isn't working correctly.")
        # Diagnostic fallback: locate the start of the incomplete array by
        # searching backwards for the nearest '['. The search must not stop at
        # ',' or newlines, otherwise a row with more than one element (the
        # normal case) would never reach its opening bracket.
        arrayStart = json1Content.rfind('[', 0, breakPosition)
        if arrayStart >= 0:
            manualCutPiece = json1Content[arrayStart:breakPosition]
            print(f"\nManually found cut piece: {manualCutPiece[:200]}")

    # Test 3: Build incomplete context (structure hierarchy with cut point)
    print("\n" + "=" * 80)
    print("TEST 3: Build Incomplete Context (Structure Hierarchy with Cut Point)")
    print("=" * 80)
    print("Expected: Should show complete hierarchy from root to cut point")
    print(" with complete elements before cut and cut piece marked")

    incompleteContext = buildIncompleteContext(json1Content, breakPosition)
    print(f"\nIncomplete context (length: {len(incompleteContext)} chars):")
    print("-" * 80)
    print(incompleteContext)
    print("-" * 80)

    # Validate the output
    if incompleteContext:
        # Check if it shows hierarchy (should have multiple levels of indentation)
        indentLevels = {
            len(line) - len(line.lstrip())
            for line in incompleteContext.split('\n')
            if line.strip()
        }
        print(f"\nValidation: Found {len(indentLevels)} different indent levels (should be > 1 for hierarchy)")

        # Check if cut point is marked
        if "<-- CUT POINT" in incompleteContext:
            print("Validation: Cut point marker found ✓")
        else:
            print("Validation: WARNING - Cut point marker NOT found!")

        # Check if root structure is shown
        if incompleteContext.strip().startswith(('{', '[')):
            print("Validation: Root structure opening found ✓")
        else:
            print("Validation: WARNING - Root structure opening NOT found!")
    else:
        print("WARNING: Incomplete context is empty!")

    # Test 4: Extract overlap context (cut part and full part before same level)
    print("\n" + "=" * 80)
    print("TEST 4: Extract Overlap Context (Cut Part + Full Part Before Same Level)")
    print("=" * 80)

    overlapContext = CodeGenerationPath._extractOverlapContext(json1Content, breakPosition)
    print("\nOverlap context:")
    print(overlapContext)

    # Return results as dictionary
    return {
        "hierarchy": hierarchy,
        "cutPiece": cutPiece,
        "incompleteContext": incompleteContext,
        "overlapContext": overlapContext,
        "breakPosition": breakPosition,
        "json1Length": len(json1Content),
        "json1Content": json1Content,
    }
if __name__ == "__main__":
    # Script entry point: run the test, print a short summary to the console
    # and dump a full report to local/debug/test_overlap_results.txt.
    banner = "=" * 80
    rule = "-" * 80

    print("Testing Overlap Context Generation")
    print(banner)

    results = testOverlapContext()
    breakPos = results['breakPosition']
    levels = results['hierarchy']

    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    print(f"\nBreak position: {breakPos}")
    print(f"JSON1 length: {results['json1Length']}")
    print(f"Hierarchy levels: {len(levels) if levels else 0}")
    print(f"Cut piece length: {len(results['cutPiece'])}")
    print(f"Incomplete context length: {len(results['incompleteContext'])}")
    print(f"Overlap context length: {len(results['overlapContext'])}")

    # Save results to file for inspection
    outputPath = Path(__file__).parent.parent.parent / "local" / "debug" / "test_overlap_results.txt"
    outputPath.parent.mkdir(parents=True, exist_ok=True)
    with open(outputPath, 'w', encoding='utf-8') as reportFile:
        rawJson = results['json1Content']

        # Section 1: the raw, cut-off JSON input
        report = [
            banner + "\n",
            "OVERLAP CONTEXT TEST RESULTS\n",
            banner + "\n\n",
            "FIRST JSON (CUT/INCOMPLETE):\n",
            rule + "\n",
            f"Break position: {breakPos}\n",
            f"JSON length: {results['json1Length']}\n",
            f"Content around break: '{rawJson[max(0, breakPos-100):breakPos+20]}'\n\n",
            "Full JSON1 content:\n",
            rawJson,
        ]

        # Section 2: one line per hierarchy level, root first
        report += [
            "\n\n" + banner + "\n",
            "STRUCTURE HIERARCHY:\n",
            rule + "\n",
        ]
        if levels:
            report.append(f"Hierarchy levels found: {len(levels)}\n\n")
            report.append("Hierarchy details (from root to cut level):\n")
            for idx, entry in enumerate(levels):
                kind = entry['type']
                keyName = entry.get('key', 'N/A')
                depth = entry['depth']
                startPos = entry['start_pos']
                endPos = entry['end_pos']
                report.append(
                    f" Level {idx}: {kind:6s} depth={depth} key='{keyName}' start={startPos} end={endPos}\n"
                )
        else:
            report.append("No hierarchy found\n")

        # Sections 3 and 4: the generated prompt contexts
        report += [
            "\n\n" + banner + "\n",
            "INCOMPLETE CONTEXT (Structure Hierarchy with Cut Point):\n",
            rule + "\n",
            results['incompleteContext'],
            "\n\n" + banner + "\n",
            "OVERLAP CONTEXT (Object containing the cut element):\n",
            rule + "\n",
            results['overlapContext'],
        ]

        reportFile.writelines(report)

    print(f"\n\nFull results saved to: {outputPath}")