gateway/modules/shared/jsonContinuation.py

1232 lines
44 KiB
Python

"""
JSON Continuation Context Module
Generiert drei Kontexte für abgeschnittene JSON-Strings:
1. Overlap Context: Das innerste Objekt/Array-Element, das den Cut-Punkt enthält
2. Hierarchy Context: Die hierarchische Struktur vom Root bis zum Cut mit Budget-Logik
3. Complete Part: Der vollständige Teil des JSONs mit allen Strukturen geschlossen
Hauptfunktionen:
- extractContinuationContexts(truncatedJson: str) -> Tuple[str, str, str]
Extrahiert alle drei Kontexte aus einem abgeschnittenen JSON-String.
- getContexts(truncatedJson: str) -> JsonContinuationContexts
Gibt alle Kontexte als Pydantic-Modell zurück mit benannten Feldern.
Modulkonstanten:
- BUDGET_LIMIT: int = 500
Zeichen-Budget für vollständige Datenwerte im Hierarchy Context
- OVERLAP_MAX_CHARS: int = 1000
Maximale Zeichen für den Overlap Context
Verwendung:
>>> from modules.shared.jsonContinuation import getContexts
>>> jsonStr = '{"users": [{"name": "John", "bio": "Hello Wor'
>>> contexts = getContexts(jsonStr)
>>> print(contexts.overlapContext)
>>> print(contexts.hierarchyContext)
>>> print(contexts.completePart)
Autor: Claude
Version: 2.0
"""
from typing import Tuple, List, Optional, Any
from dataclasses import dataclass
from enum import Enum
from modules.datamodels.datamodelAi import JsonContinuationContexts
# =============================================================================
# MODULE CONSTANTS
# =============================================================================
BUDGET_LIMIT: int = 500
"""Zeichen-Budget für vollständige Datenwerte im Hierarchy Context"""
OVERLAP_MAX_CHARS: int = 1000
"""Maximale Zeichen für den Overlap Context"""
# =============================================================================
# TOKEN TYPES AND DATA CLASSES
# =============================================================================
class TokenType(Enum):
    """Kinds of tokens produced while scanning a (possibly truncated) JSON string."""

    # Structural punctuation: the member value is the literal character.
    OBJECT_START = "{"
    OBJECT_END = "}"
    ARRAY_START = "["
    ARRAY_END = "]"

    # Scalar values.
    STRING = "string"
    NUMBER = "number"
    BOOLEAN = "boolean"
    NULL = "null"

    # Separators: the member value is the literal character.
    COLON = ":"
    COMMA = ","

    # Bookkeeping kinds.
    KEY = "key"  # NOTE(review): never emitted by the tokenizer in this file
    EOF = "eof"  # end of the input was reached
    TRUNCATED = "truncated"  # token was cut off, or the character is unknown
@dataclass
class Token:
    """A single JSON token together with where it was found in the input."""

    # Kind of token.
    type: TokenType
    # Interpreted value (for example the text between the quotes of a string).
    value: Any
    # Index of the first character of the token in the input.
    start_pos: int
    # Index just past the last character of the token.
    end_pos: int
    # The token text as it appears in the input.
    raw: str
@dataclass
class StackFrame:
    """One open container (object or array) tracked while scanning the JSON."""

    # "object" or "array".
    type: str
    # Index of the container's opening bracket/brace in the input.
    start_pos: int
    # Most recently seen key (objects only).
    key: Optional[str] = None
    # Index of the current element (arrays only).
    index: int = 0
    # Accumulated content for this frame.
    content: str = ""
    # Keys seen so far in this object. The default is None (not a shared list)
    # and is replaced by a fresh list in __post_init__, so the annotation has
    # to admit None.
    keys_seen: Optional[List[str]] = None

    def __post_init__(self):
        """Give every frame its own key list when none was supplied."""
        if self.keys_seen is None:
            self.keys_seen = []
class JsonTokenizer:
    """
    Tokenizer for potentially truncated JSON strings.

    Tokens are produced one at a time via nextToken(). Input that ends in
    the middle of a string or keyword, and any unrecognised character, is
    reported as a TRUNCATED token instead of raising.
    """

    _WHITESPACE = ' \t\n\r'
    # Single-character tokens; each character is also the value of its
    # TokenType member, so the member can be looked up by value.
    _PUNCTUATION = '{}[]:,'

    def __init__(self, jsonStr: str):
        self.jsonStr = jsonStr
        self.pos = 0
        self.length = len(jsonStr)

    def skipWhitespace(self):
        """Advance past spaces, tabs, newlines and carriage returns."""
        while self.pos < self.length and self.jsonStr[self.pos] in self._WHITESPACE:
            self.pos += 1

    def peek(self) -> Optional[str]:
        """Return the current character without consuming it, or None at end of input."""
        if self.pos < self.length:
            return self.jsonStr[self.pos]
        return None

    def readString(self) -> "Token":
        """
        Read a string token starting at the opening quote.

        Returns:
            A STRING token whose value is the text between the quotes (escape
            sequences are left undecoded), or a TRUNCATED token holding
            everything after the opening quote when the input ends before
            the closing quote.
        """
        start = self.pos
        self.pos += 1  # skip the opening quote
        escaped = False
        while self.pos < self.length:
            char = self.jsonStr[self.pos]
            self.pos += 1
            if escaped:
                # The character after a backslash never terminates the string.
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '"':
                raw = self.jsonStr[start:self.pos]
                return Token(TokenType.STRING, raw[1:-1], start, self.pos, raw)
        # Ran off the end of the input: the string was cut.
        raw = self.jsonStr[start:self.pos]
        return Token(TokenType.TRUNCATED, raw[1:], start, self.pos, raw)

    def _skipDigits(self) -> None:
        """Advance past a run of digit characters."""
        while self.pos < self.length and self.jsonStr[self.pos].isdigit():
            self.pos += 1

    def readNumber(self) -> "Token":
        """
        Read a number token: optional '-', digits, optional fraction and exponent.

        The value is an int, or a float when the text contains '.' or an
        exponent. Text that cannot be converted (for example a lone '-')
        is kept as the raw string.
        """
        start = self.pos
        if self.peek() == '-':
            self.pos += 1
        self._skipDigits()
        if self.peek() == '.':
            self.pos += 1
            self._skipDigits()
        marker = self.peek()
        if marker is not None and marker in 'eE':
            self.pos += 1
            sign = self.peek()
            if sign is not None and sign in '+-':
                self.pos += 1
            self._skipDigits()
        raw = self.jsonStr[start:self.pos]
        try:
            value = float(raw) if '.' in raw or 'e' in raw.lower() else int(raw)
        except ValueError:
            value = raw
        return Token(TokenType.NUMBER, value, start, self.pos, raw)

    def readKeyword(self) -> "Token":
        """
        Read true, false or null.

        Any other run of letters (including a keyword that was cut short)
        is returned as a TRUNCATED token.
        """
        start = self.pos
        keywords = (
            ('true', TokenType.BOOLEAN, True),
            ('false', TokenType.BOOLEAN, False),
            ('null', TokenType.NULL, None),
        )
        for keyword, tokenType, value in keywords:
            # startswith() with an offset avoids copying the rest of the input.
            if self.jsonStr.startswith(keyword, self.pos):
                self.pos += len(keyword)
                return Token(tokenType, value, start, self.pos, keyword)
        while self.pos < self.length and self.jsonStr[self.pos].isalpha():
            self.pos += 1
        raw = self.jsonStr[start:self.pos]
        return Token(TokenType.TRUNCATED, raw, start, self.pos, raw)

    def nextToken(self) -> "Token":
        """Return the next token; an EOF token once the input is exhausted."""
        self.skipWhitespace()
        if self.pos >= self.length:
            return Token(TokenType.EOF, None, self.pos, self.pos, "")
        char = self.jsonStr[self.pos]
        start = self.pos
        if char in self._PUNCTUATION:
            self.pos += 1
            return Token(TokenType(char), char, start, self.pos, char)
        if char == '"':
            return self.readString()
        if char == '-' or char.isdigit():
            return self.readNumber()
        if char.isalpha():
            return self.readKeyword()
        # Unknown character: consume it and report it as truncated.
        self.pos += 1
        return Token(TokenType.TRUNCATED, char, start, self.pos, char)
@dataclass
class HierarchyLevel:
    """Description of a single nesting level of the parsed JSON hierarchy."""

    # "object" or "array".
    type: str
    # Index where this level starts in the input.
    start_pos: int
    # Index where this level ends; -1 while the level is not closed.
    end_pos: int
    # Key under which this level sits when it is a value inside an object.
    key: Optional[str]
    # Position of this level when it is an element of an array.
    index: Optional[int]
    # Parsed content at this level.
    content: dict
    # Raw text from the start of this level up to its children.
    raw_start: str
    # For arrays: the list of parsed elements.
    children_content: List[Any]
def getJsonContinuationContext(
    truncatedJson: str,
    budgetLimit: Optional[int] = None,
    overlapMaxChars: Optional[int] = None
) -> Tuple[str, str, str]:
    """
    Build the three continuation contexts for a truncated JSON string.

    The contexts are:
    1. Overlap context: the innermost object/array element containing the cut.
    2. Hierarchy context: the structure from the root to the cut, rendered
       with budget logic.
    3. Complete part: the available JSON with all open structures closed.

    Args:
        truncatedJson: The truncated JSON string.
        budgetLimit: Character budget for data values in the hierarchy
            context; BUDGET_LIMIT is used when None.
        overlapMaxChars: Maximum number of characters of the overlap
            context; OVERLAP_MAX_CHARS is used when None.

    Returns:
        Tuple of (overlapContext, hierarchyContext, completePart).
    """
    effectiveBudget = BUDGET_LIMIT if budgetLimit is None else budgetLimit
    effectiveOverlap = OVERLAP_MAX_CHARS if overlapMaxChars is None else overlapMaxChars
    return JsonAnalyzer(truncatedJson, effectiveBudget, effectiveOverlap).analyze()
class JsonAnalyzer:
    """
    Analyze a truncated JSON string and derive its continuation contexts.

    Three contexts are produced by analyze():
    1. Overlap context: the innermost object/array element containing the
       cut, copied verbatim from the input.
    2. Hierarchy context: the input from root to cut, with complete string
       values far from the cut replaced by "..." according to a budget.
    3. Complete part: the input repaired so that an open string and all open
       containers are closed and dangling fragments are removed.
    """

    _WHITESPACE = ' \t\n\r'
    # Characters that end a bare literal (number, true, false, null).
    _LITERAL_DELIMITERS = ' \t\n\r,:[]{}"'
    _HEX_DIGITS = '0123456789abcdefABCDEF'

    def __init__(self, jsonStr: str, budgetLimit: Optional[int] = None, overlapMaxChars: Optional[int] = None):
        self.jsonStr = jsonStr
        self.budgetLimit = budgetLimit if budgetLimit is not None else BUDGET_LIMIT
        self.overlapMaxChars = overlapMaxChars if overlapMaxChars is not None else OVERLAP_MAX_CHARS
        # Containers still open at the cut, outermost first.
        self.stack: List["StackFrame"] = []
        # NOTE(review): not populated anywhere in this class.
        self.hierarchy: List[dict] = []

    def analyze(self) -> Tuple[str, str, str]:
        """
        Analyze the truncated JSON and return all three contexts.

        Returns:
            Tuple of (overlapContext, hierarchyContext, completePart).
        """
        self._parseStructure()
        overlapContext = self._generateOverlapContext()
        hierarchyContext = self._renderWithBudgetV2()
        completePart = self._generateCompletePart()
        return overlapContext, hierarchyContext, completePart

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _isEscaped(text: str, pos: int) -> bool:
        """Return True when the character at pos is preceded by an odd number of backslashes."""
        backslashes = 0
        probe = pos - 1
        while probe >= 0 and text[probe] == '\\':
            backslashes += 1
            probe -= 1
        return backslashes % 2 == 1

    @staticmethod
    def _peekSignificantChar(tokenizer: "JsonTokenizer") -> Optional[str]:
        """Return the next non-whitespace character without moving the tokenizer."""
        savedPos = tokenizer.pos
        tokenizer.skipWhitespace()
        nextChar = tokenizer.peek()
        tokenizer.pos = savedPos
        return nextChar

    @staticmethod
    def _keyPrefix(node: dict) -> str:
        """Return '"key": ' for nodes that carry a key, otherwise an empty string."""
        if node.get('key'):
            return f'"{node["key"]}": '
        return ""

    def _tailChars(self) -> str:
        """
        Return at most overlapMaxChars characters from the end of the input.

        Computed from an explicit start index because a plain
        ``text[-limit:]`` yields the whole text when the limit is 0.
        """
        return self.jsonStr[max(0, len(self.jsonStr) - self.overlapMaxChars):]

    # ------------------------------------------------------------------
    # Complete part
    # ------------------------------------------------------------------

    def _generateCompletePart(self) -> str:
        """
        Return the input with every open structure closed.

        Strategy:
        1. Cut inside a string: drop an incomplete trailing escape sequence
           and close the string. The string content is otherwise kept
           verbatim, including trailing commas and whitespace.
        2. Cut outside a string: strip trailing whitespace, drop a dangling
           bare literal that is not a complete number/true/false/null, and
           remove a trailing comma.
        3. Remove an incomplete key-value pair (a key without a value).
        4. Append the closing brackets/braces for all open containers.
        """
        result = self.jsonStr
        if self._getStringClosing(result):
            result = self._trimIncompleteEscape(result) + '"'
        else:
            result = self._dropIncompleteLiteral(result.rstrip())
            if result.endswith(','):
                result = result[:-1]
        result = self._cleanIncompleteKeyValue(result)
        return result + self._getClosingBrackets(result)

    def _getStringClosing(self, jsonStr: str) -> str:
        """Return '"' when jsonStr ends inside an unclosed string, otherwise ''."""
        inString = False
        escaped = False
        for char in jsonStr:
            if escaped:
                escaped = False
                continue
            if char == '\\' and inString:
                escaped = True
                continue
            if char == '"':
                inString = not inString
        return '"' if inString else ""

    def _trimIncompleteEscape(self, jsonStr: str) -> str:
        """
        Drop an escape sequence that was cut off at the end of an open string.

        Handles a lone trailing backslash (which would otherwise escape the
        closing quote that gets appended) and a ``\\u`` escape with fewer
        than four hex digits.
        """
        trailingBackslashes = len(jsonStr) - len(jsonStr.rstrip('\\'))
        if trailingBackslashes % 2 == 1:
            return jsonStr[:-1]
        # A partial \uXXXX escape is at most five characters long.
        escapeStart = jsonStr.rfind('\\', max(0, len(jsonStr) - 5))
        if escapeStart == -1 or self._isEscaped(jsonStr, escapeStart):
            return jsonStr
        tail = jsonStr[escapeStart + 1:]
        if tail[:1] == 'u' and all(char in self._HEX_DIGITS for char in tail[1:]):
            return jsonStr[:escapeStart]
        return jsonStr

    def _dropIncompleteLiteral(self, jsonStr: str) -> str:
        """
        Remove a trailing bare literal that is not valid JSON on its own.

        Examples: ``tru``, ``nul``, ``-``, ``1.`` or ``1e``. A literal that
        is syntactically complete (such as ``12``) is kept. Only meaningful
        when jsonStr does not end inside a string.
        """
        start = len(jsonStr)
        while start > 0 and jsonStr[start - 1] not in self._LITERAL_DELIMITERS:
            start -= 1
        literal = jsonStr[start:]
        if not literal or self._isCompleteLiteral(literal):
            return jsonStr
        return jsonStr[:start].rstrip()

    @staticmethod
    def _isCompleteLiteral(literal: str) -> bool:
        """Return True for true/false/null or a number matching the JSON grammar."""
        import re  # local import: only needed on this rarely taken path
        if literal in ('true', 'false', 'null'):
            return True
        numberPattern = r'-?(?:0|[1-9][0-9]*)(?:\.[0-9]+)?(?:[eE][+-]?[0-9]+)?'
        return re.fullmatch(numberPattern, literal) is not None

    def _cleanIncompleteKeyValue(self, jsonStr: str) -> str:
        """
        Remove a trailing key that has no value.

        Handles cases like:
        - {"key": "incompl"   -> kept (a closed, truncated value)
        - {"key":             -> key removed
        - {"a": 1, "key"      -> key removed (the cut was inside the key name)
        """
        stripped = jsonStr.rstrip()
        if stripped.endswith(':'):
            return self._removeLastKey(stripped)
        if stripped.endswith('"') and self._isIncompleteKey(stripped):
            return self._removeLastKey(stripped)
        return jsonStr

    def _isIncompleteKey(self, jsonStr: str) -> bool:
        """
        Tell whether the string that ends jsonStr is a key without a colon.

        That is the case when the string directly follows '{' or ',' and the
        innermost open container at that point is an object. The same
        pattern inside an array is an ordinary string element.
        """
        if not jsonStr or jsonStr[-1] != '"':
            return False
        # Walk back to the unescaped opening quote of the final string.
        openQuote = len(jsonStr) - 2
        while openQuote >= 0:
            if jsonStr[openQuote] == '"' and not self._isEscaped(jsonStr, openQuote):
                break
            openQuote -= 1
        if openQuote < 0:
            return False
        before = openQuote - 1
        while before >= 0 and jsonStr[before] in self._WHITESPACE:
            before -= 1
        if before < 0 or jsonStr[before] not in ',{':
            return False
        # The first pending closer belongs to the innermost open container.
        return self._getClosingBrackets(jsonStr[:before + 1]).startswith('}')

    def _removeLastKey(self, jsonStr: str) -> str:
        """
        Remove the last incomplete key-value pair from jsonStr.

        Scans backwards to the nearest ',' or '{' outside a string. The
        text is cut before a comma, or just after an opening brace. When
        neither is found the stripped input is returned unchanged.
        """
        stripped = jsonStr.rstrip()
        inString = False
        pos = len(stripped) - 1
        while pos >= 0:
            char = stripped[pos]
            if char == '"' and not self._isEscaped(stripped, pos):
                inString = not inString
            if not inString and char in ',{':
                break
            pos -= 1
        if pos < 0:
            return stripped
        if stripped[pos] == ',':
            return stripped[:pos]
        return stripped[:pos + 1]

    def _findLastCompletePosition(self) -> int:
        """
        Return the end position of the last complete value before the cut.

        Scanning stops at the first truncated token. When no value was
        completed, the length of the input is returned.
        """
        tokenizer = JsonTokenizer(self.jsonStr)
        lastValueEnd = 0
        endsValue = (
            TokenType.OBJECT_END, TokenType.ARRAY_END,
            TokenType.NUMBER, TokenType.BOOLEAN, TokenType.NULL,
        )
        while True:
            token = tokenizer.nextToken()
            if token.type in (TokenType.EOF, TokenType.TRUNCATED):
                break
            if token.type in endsValue:
                lastValueEnd = token.end_pos
            elif token.type == TokenType.STRING:
                # A string followed by ':' is a key, not a value.
                if self._peekSignificantChar(tokenizer) != ':':
                    lastValueEnd = token.end_pos
        return lastValueEnd if lastValueEnd > 0 else len(self.jsonStr)

    def _getClosingBrackets(self, jsonStr: str) -> str:
        """Return the closers for all containers still open at the end, innermost first."""
        pending = []
        inString = False
        escaped = False
        for char in jsonStr:
            if escaped:
                escaped = False
                continue
            if char == '\\' and inString:
                escaped = True
                continue
            if char == '"':
                inString = not inString
                continue
            if inString:
                continue
            if char == '{':
                pending.append('}')
            elif char == '[':
                pending.append(']')
            elif char in '}]':
                # Only a matching closer pops; a stray one is ignored.
                if pending and pending[-1] == char:
                    pending.pop()
        return ''.join(reversed(pending))

    # ------------------------------------------------------------------
    # Structure tracking
    # ------------------------------------------------------------------

    def _parseStructure(self):
        """Tokenize the input and record which containers are still open at the cut."""
        # Start from a clean stack so repeated analyze() calls do not pile up frames.
        self.stack = []
        tokenizer = JsonTokenizer(self.jsonStr)
        while True:
            token = tokenizer.nextToken()
            if token.type in (TokenType.EOF, TokenType.TRUNCATED):
                break
            if token.type == TokenType.OBJECT_START:
                self.stack.append(StackFrame(type="object", start_pos=token.start_pos, keys_seen=[]))
            elif token.type == TokenType.ARRAY_START:
                self.stack.append(StackFrame(type="array", start_pos=token.start_pos, index=0))
            elif token.type == TokenType.OBJECT_END:
                if self.stack and self.stack[-1].type == "object":
                    self.stack.pop()
            elif token.type == TokenType.ARRAY_END:
                if self.stack and self.stack[-1].type == "array":
                    self.stack.pop()
            elif token.type == TokenType.STRING:
                self._handleStringToken(token, tokenizer)
            elif token.type == TokenType.COMMA:
                # A comma directly inside an array moves on to the next element.
                if self.stack and self.stack[-1].type == "array":
                    self.stack[-1].index += 1

    def _handleStringToken(self, token: "Token", tokenizer: "JsonTokenizer"):
        """Record the string as the current key when it sits in an object and is followed by ':'."""
        if not self.stack or self.stack[-1].type != "object":
            return
        if self._peekSignificantChar(tokenizer) == ':':
            frame = self.stack[-1]
            frame.key = token.value
            frame.keys_seen.append(token.value)

    # ------------------------------------------------------------------
    # Overlap context
    # ------------------------------------------------------------------

    def _generateOverlapContext(self) -> str:
        """
        Return the innermost object/array element containing the cut.

        The text is taken verbatim from the input, from the start of that
        element to the end. When it exceeds overlapMaxChars, or when no
        container is open, the last overlapMaxChars characters of the input
        are returned instead.
        """
        if not self.stack:
            return self._tailChars()
        overlap = self.jsonStr[self._findInnermostElementStart():]
        if len(overlap) > self.overlapMaxChars:
            overlap = self._tailChars()
        return overlap

    def _findInnermostElementStart(self) -> int:
        """
        Return the start index of the innermost unit containing the cut.

        - Innermost container is an object: its start.
        - Innermost container is an array whose current element starts with
          '{' or '[': the start of that element.
        - Otherwise (primitive array element): the start of the nearest
          enclosing object, or overlapMaxChars from the end if there is none.
        """
        fallback = max(0, len(self.jsonStr) - self.overlapMaxChars)
        if not self.stack:
            return fallback
        innermost = self.stack[-1]
        if innermost.type == "object":
            return innermost.start_pos
        elementStart = self._findArrayElementStart(innermost)
        elementContent = self.jsonStr[elementStart:].strip()
        if elementContent and elementContent[0] in '{[':
            return elementStart
        for frame in reversed(self.stack[:-1]):
            if frame.type == "object":
                return frame.start_pos
        return fallback

    def _findArrayElementStart(self, arrayFrame: "StackFrame") -> int:
        """
        Return the start index of the array element selected by arrayFrame.index.

        Element boundaries are the commas at nesting depth 0 (relative to
        the array) outside of strings. When the index lies beyond the
        elements found, the start of the last element is returned.
        """
        content = self.jsonStr[arrayFrame.start_pos:]
        size = len(content)
        pos = 1  # skip the opening bracket
        while pos < size and content[pos] in self._WHITESPACE:
            pos += 1
        elementStarts = [arrayFrame.start_pos + pos]
        depth = 0
        inString = False
        escaped = False
        i = pos
        while i < size:
            char = content[i]
            if escaped:
                escaped = False
            elif inString:
                if char == '\\':
                    escaped = True
                elif char == '"':
                    inString = False
            elif char == '"':
                inString = True
            elif char in '{[':
                depth += 1
            elif char in '}]':
                depth -= 1
            elif char == ',' and depth == 0:
                i += 1
                while i < size and content[i] in self._WHITESPACE:
                    i += 1
                elementStarts.append(arrayFrame.start_pos + i)
                # Resume at the first character of the new element. Stepping
                # past it would hide an opening '{', '[' or '"' from the
                # depth and string tracking above.
                continue
            i += 1
        if arrayFrame.index < len(elementStarts):
            return elementStarts[arrayFrame.index]
        return elementStarts[-1]

    # ------------------------------------------------------------------
    # Hierarchy context, tree-rendering variant
    # ------------------------------------------------------------------

    def _generateHierarchyContext(self) -> str:
        """
        Return the hierarchy context rendered from the parsed tree.

        Without any open container the last overlapMaxChars characters of
        the input are returned.
        """
        if not self.stack:
            return self._tailChars()
        return self._rebuildWithBudget()

    def _rebuildWithBudget(self) -> str:
        """Parse the input into a tree and render it under the budget."""
        return self._renderWithBudget(self._parseForHierarchy())

    def _parseForHierarchy(self) -> dict:
        """
        Parse the input into a tree of dict nodes.

        Node types: 'root', 'object', 'array', 'value' and 'truncated_value'.
        Containers record 'start_pos' and get 'end_pos' plus 'complete'
        once their closing token has been seen.
        """
        root = {
            'type': 'root',
            'children': [],
            'raw_positions': []
        }
        tokenizer = JsonTokenizer(self.jsonStr)
        stack = [root]  # the root is never popped, so stack[-1] always exists
        currentKey = None
        while True:
            token = tokenizer.nextToken()
            kind = token.type
            if kind == TokenType.EOF:
                break
            if kind == TokenType.TRUNCATED:
                parent = stack[-1]
                if parent.get('type') == 'object':
                    # A truncated token without a pending key is a cut key
                    # name and is not recorded.
                    if currentKey:
                        parent['children'].append({
                            'type': 'truncated_value',
                            'key': currentKey,
                            'raw': self.jsonStr[token.start_pos:],
                            'start_pos': token.start_pos
                        })
                elif parent.get('type') == 'array':
                    parent['children'].append({
                        'type': 'truncated_value',
                        'raw': self.jsonStr[token.start_pos:],
                        'start_pos': token.start_pos
                    })
                break
            if kind in (TokenType.OBJECT_START, TokenType.ARRAY_START):
                container = {
                    'type': 'object' if kind == TokenType.OBJECT_START else 'array',
                    'key': currentKey,
                    'children': [],
                    'start_pos': token.start_pos
                }
                stack[-1]['children'].append(container)
                stack.append(container)
                currentKey = None
            elif kind in (TokenType.OBJECT_END, TokenType.ARRAY_END):
                expected = 'object' if kind == TokenType.OBJECT_END else 'array'
                if len(stack) > 1 and stack[-1].get('type') == expected:
                    stack[-1]['end_pos'] = token.end_pos
                    stack[-1]['complete'] = True
                    stack.pop()
            elif kind == TokenType.STRING:
                followedByColon = self._peekSignificantChar(tokenizer) == ':'
                if followedByColon and stack[-1].get('type') == 'object':
                    currentKey = token.value
                else:
                    stack[-1]['children'].append({
                        'type': 'value',
                        'key': currentKey,
                        'value': token.value,
                        'raw': token.raw,
                        'start_pos': token.start_pos,
                        'end_pos': token.end_pos,
                        'value_type': 'string'
                    })
                    currentKey = None
            elif kind in (TokenType.NUMBER, TokenType.BOOLEAN, TokenType.NULL):
                stack[-1]['children'].append({
                    'type': 'value',
                    'key': currentKey,
                    'value': token.value,
                    'raw': token.raw,
                    'start_pos': token.start_pos,
                    'end_pos': token.end_pos,
                    'value_type': str(kind.value)
                })
                currentKey = None
        return root

    def _renderWithBudget(self, structure: dict) -> str:
        """
        Render the tree, spending the budget on the values closest to the cut.

        Values are visited by increasing distance from the cut. Each value
        whose raw text still fits into the remaining budget is rendered in
        full; all others are rendered as "...".
        """
        cutPos = len(self.jsonStr)
        candidates = sorted(
            self._collectValuesWithDistance(structure, cutPos),
            key=lambda entry: entry['distance'],
        )
        budgetRemaining = self.budgetLimit
        valuesWithBudget = set()
        for entry in candidates:
            size = len(str(entry['raw']))
            if budgetRemaining >= size:
                valuesWithBudget.add(entry['id'])
                budgetRemaining -= size
        return self._renderNode(structure, valuesWithBudget, indent=0)

    def _collectValuesWithDistance(self, node: dict, cutPos: int, depth: int = 0) -> list:
        """Collect every value node below node together with its distance from the cut."""
        values = []
        nodeType = node.get('type')
        if nodeType == 'value':
            values.append({
                'id': id(node),
                'node': node,
                'distance': cutPos - node.get('end_pos', cutPos),
                'raw': node.get('raw', ''),
                'depth': depth
            })
        elif nodeType == 'truncated_value':
            values.append({
                'id': id(node),
                'node': node,
                'distance': 0,  # truncated values sit at the cut
                'raw': node.get('raw', ''),
                'depth': depth
            })
        for child in node.get('children', []):
            values.extend(self._collectValuesWithDistance(child, cutPos, depth + 1))
        return values

    def _renderNode(self, node: dict, valuesWithBudget: set, indent: int = 0) -> str:
        """Render one tree node; unknown node types render as an empty string."""
        nodeType = node.get('type')
        if nodeType == 'root':
            return '\n'.join(
                self._renderNode(child, valuesWithBudget, indent)
                for child in node.get('children', [])
            )
        if nodeType == 'object':
            return self._renderObject(node, valuesWithBudget, indent)
        if nodeType == 'array':
            return self._renderArray(node, valuesWithBudget, indent)
        if nodeType == 'value':
            return self._renderValue(node, valuesWithBudget, indent)
        if nodeType == 'truncated_value':
            return node.get('raw', '')
        return ''

    def _renderContainer(self, node: dict, valuesWithBudget: set, indent: int,
                         opener: str, closer: str) -> str:
        """
        Render an object or array node, one child per line.

        The closer is only emitted for containers marked 'complete'. No
        comma follows the last child or a truncated value.
        """
        keyPrefix = self._keyPrefix(node)
        children = node.get('children')
        if not children:
            return f"{keyPrefix}{opener}{closer}" if node.get('complete') else f"{keyPrefix}{opener}"
        innerIndent = " " * (indent + 1)
        lastIndex = len(children) - 1
        lines = [f"{keyPrefix}{opener}"]
        for position, child in enumerate(children):
            rendered = self._renderNode(child, valuesWithBudget, indent + 1)
            needsComma = position < lastIndex and child.get('type') != 'truncated_value'
            separator = "," if needsComma else ""
            lines.append(f"{innerIndent}{rendered}{separator}")
        if node.get('complete'):
            lines.append(" " * indent + closer)
        return '\n'.join(lines)

    def _renderObject(self, node: dict, valuesWithBudget: set, indent: int) -> str:
        """Render an object node."""
        return self._renderContainer(node, valuesWithBudget, indent, '{', '}')

    def _renderArray(self, node: dict, valuesWithBudget: set, indent: int) -> str:
        """Render an array node."""
        return self._renderContainer(node, valuesWithBudget, indent, '[', ']')

    def _renderValue(self, node: dict, valuesWithBudget: set, indent: int) -> str:
        """Render a value node in full when it has budget, otherwise as "..."."""
        keyPrefix = self._keyPrefix(node)
        if id(node) not in valuesWithBudget:
            return f'{keyPrefix}"..."'
        rawValue = node.get('raw', '"..."')
        return f"{keyPrefix}{rawValue}"

    # ------------------------------------------------------------------
    # Hierarchy context, in-place variant (used by analyze)
    # ------------------------------------------------------------------

    def _renderWithBudgetV2(self) -> str:
        """
        Return the input with distant string values replaced by "...".

        Unlike the tree renderer this keeps the original text and layout.
        Budget logic:
        1. Collect all complete string values with their positions.
        2. Order them by end position, so those furthest from the cut come first.
        3. Replace values from the start until the combined raw length of
           the remaining values no longer exceeds budgetLimit.
        """
        structure = self._parseForHierarchy()
        values = sorted(self._collectCompleteValues(structure), key=lambda value: value['end_pos'])
        remaining = sum(len(value['raw']) for value in values)
        valuesToReplace = []
        for value in values:
            if remaining <= self.budgetLimit:
                break
            valuesToReplace.append(value)
            remaining -= len(value['raw'])
        # String tokens never overlap, so one left-to-right pass can rebuild
        # the text around the replaced spans.
        pieces = []
        cursor = 0
        for value in valuesToReplace:
            pieces.append(self.jsonStr[cursor:value['start_pos']])
            pieces.append('"..."')
            cursor = value['end_pos']
        pieces.append(self.jsonStr[cursor:])
        return ''.join(pieces)

    def _collectCompleteValues(self, node: dict) -> list:
        """Collect all complete (non-truncated) string value nodes below node."""
        values = []
        if node.get('type') == 'value' and node.get('value_type') == 'string':
            values.append({
                'start_pos': node['start_pos'],
                'end_pos': node['end_pos'],
                'raw': node['raw'],
                'key': node.get('key')
            })
        for child in node.get('children', []):
            values.extend(self._collectCompleteValues(child))
        return values
def extractContinuationContexts(
    truncatedJson: str
) -> Tuple[str, str, str]:
    """
    Main entry point: extract all three continuation contexts from a truncated JSON.

    The three contexts are:
    1. Overlap context: the innermost object/array element containing the cut.
       - Used to merge the truncated part with the newly generated part.
       - Taken exactly as it appears in the original string (for string
         matching during the merge).
    2. Hierarchy context: the hierarchical structure from the root to the cut.
       - With budget logic: values close to the cut are kept in full, values
         further away become "..." placeholders.
       - Gives the AI the context of the overall JSON structure.
    3. Complete part: the JSON up to the cut with everything closed.
       - All open structures are closed (}, ], ").
       - Incomplete keys are removed.

    Uses module constants BUDGET_LIMIT and OVERLAP_MAX_CHARS.

    Args:
        truncatedJson: The truncated JSON string.

    Returns:
        Tuple of (overlapContext, hierarchyContext, completePart).

    Example:
        >>> jsonStr = '{"users": [{"name": "John", "bio": "Hello Wor'
        >>> overlap, hierarchy, complete = extractContinuationContexts(jsonStr)
        >>> import json
        >>> parsed = json.loads(complete)
    """
    overlapContext, hierarchyContext, completePart = getJsonContinuationContext(truncatedJson)
    return overlapContext, hierarchyContext, completePart
# Convenience function with named results
def getContexts(
    truncatedJson: str
) -> JsonContinuationContexts:
    """
    Return all contexts as a model with named fields.

    Uses module constants BUDGET_LIMIT and OVERLAP_MAX_CHARS.

    Args:
        truncatedJson: The truncated JSON string.

    Returns:
        JsonContinuationContexts with the fields overlapContext,
        hierarchyContext and completePart, filled in that order from the
        result of extractContinuationContexts().

    Example:
        >>> json_str = '{"users": [{"name": "John", "bio": "Hello Wor'
        >>> contexts = getContexts(json_str)
        >>> print(contexts.overlapContext)
        >>> print(contexts.hierarchyContext)
        >>> print(contexts.completePart)
    """
    fieldNames = ("overlapContext", "hierarchyContext", "completePart")
    contextValues = extractContinuationContexts(truncatedJson)
    return JsonContinuationContexts(**dict(zip(fieldNames, contextValues)))