# JSON extraction and repair utilities for AI-generated responses.
import json
|
||
import logging
|
||
from typing import Any, Dict, List, Optional, Tuple, Union, Type, TypeVar
|
||
from pydantic import BaseModel, ValidationError
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
T = TypeVar('T', bound=BaseModel)
|
||
|
||
|
||
def stripCodeFences(text: str) -> str:
    """Remove ```json / ``` fences and surrounding whitespace if present.

    Handles both the common multi-line form (```json\\n ... \\n```) and a
    single-line fence with no newline after the opening backticks, which the
    previous implementation left partially stripped.  The first line after the
    opening fence is dropped only when it is empty or looks like a language
    tag, so real content sharing that line is preserved.
    """
    if not text:
        return text
    s = text.strip()
    if not (s.startswith("```") and s.endswith("```")):
        return s
    # Strip opening backticks.
    s = s[3:]
    newline = s.find('\n')
    if newline != -1:
        # Drop the remainder of the fence line only when it is a plausible
        # language tag (e.g. 'json', 'c++'); keep content that shares the line.
        first_line = s[:newline].strip()
        if first_line == "" or first_line.replace('+', '').replace('-', '').replace('_', '').isalnum():
            s = s[newline + 1:]
    # Strip trailing ```.
    if s.endswith("```"):
        s = s[:-3]
    return s.strip()
|
||
|
||
|
||
def extractFirstBalancedJson(text: str) -> str:
    """Return the first balanced JSON object/array substring; otherwise return trimmed input.

    Brackets inside JSON string literals (including escaped quotes) are
    ignored, so values such as {"a": "}"} no longer terminate the scan early.
    """
    if not text:
        return text
    s = text.strip()
    # Find the first '{' or '[' - whichever comes first.
    candidates = [pos for pos in (s.find('{'), s.find('[')) if pos != -1]
    if not candidates:
        return s
    start = min(candidates)
    # Scan for the matching close using a simple stack, skipping string bodies.
    stack: List[str] = []
    in_string = False
    escaped = False
    for i in range(start, len(s)):
        ch = s[i]
        if in_string:
            if escaped:
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
        elif ch in '{[':
            stack.append(ch)
        elif ch in '}]':
            if not stack:
                continue
            opener = stack.pop()
            # Mismatched pair: tolerate and keep scanning (input is often dirty).
            if (opener == '{' and ch != '}') or (opener == '[' and ch != ']'):
                continue
            if not stack:
                return s[start:i + 1].strip()
    return s
|
||
|
||
|
||
def normalizeJsonText(text: str) -> str:
    """Light normalization: remove BOM, normalize smart quotes."""
    if not text:
        return text
    # Drop any leading UTF-8 BOM characters.
    cleaned = text.lstrip('\ufeff') if text.startswith('\ufeff') else text
    # Map curly/smart quotes to their straight ASCII equivalents in one pass.
    quote_map = str.maketrans({'\u201c': '"', '\u201d': '"', '\u2019': "'", '\u2018': "'"})
    return cleaned.translate(quote_map)
|
||
|
||
|
||
def extractJsonString(text: str) -> str:
    """Strip code fences, normalize, then extract first balanced JSON substring."""
    # Pipeline: normalize -> defence -> isolate the first balanced JSON value.
    candidate = extractFirstBalancedJson(stripCodeFences(normalizeJsonText(text)))
    return candidate.strip()
|
||
|
||
|
||
def tryParseJson(text: Union[str, bytes]) -> Tuple[Optional[Union[Dict, List]], Optional[Exception], str]:
    """Extract and parse JSON; return (obj, error, cleaned_str)."""
    if isinstance(text, bytes):
        # Decode bytes leniently; fall back to repr-style str() on failure.
        try:
            text = text.decode('utf-8', errors='replace')
        except Exception:
            text = str(text)
    cleaned = extractJsonString(text or "")
    try:
        parsed = json.loads(cleaned)
    except Exception as exc:
        return None, exc, cleaned
    return parsed, None, cleaned
|
||
|
||
|
||
def parseJsonOrRaise(text: Union[str, bytes]) -> Union[Dict, List]:
    """Parse JSON from *text*, logging and re-raising the parse error on failure."""
    obj, err, cleaned = tryParseJson(text)
    if err is None:
        return obj
    logger.error(f"parse_json_or_raise failed: {err}. Cleaned preview: {cleaned[:200]}...")
    raise err
|
||
|
||
|
||
def mergeRootLists(jsonParts: List[Union[str, Dict, List]]) -> Dict[str, Any]:
    """
    Generic merger for root-level lists: take first dict as base; for each subsequent part:
    - if value is list and same key exists as list, extend it
    - if key absent, add it
    - for non-list keys, keep the original (from the first part)
    Sets continuation=None if present in base.

    Inputs are never mutated: list values are copied before being extended
    (the previous implementation aliased the first part's lists and grew them
    in place, mutating the caller's data).
    """
    parsed: List[Dict[str, Any]] = []
    for part in jsonParts:
        if isinstance(part, (dict, list)):
            obj = part
        else:
            obj, err, _ = tryParseJson(part)
            if err is not None or not isinstance(obj, (dict, list)):
                continue
        # Only dict roots participate in the merge; bare lists have no keys.
        if isinstance(obj, dict):
            parsed.append(obj)

    if not parsed:
        return {}

    # Shallow-copy the base, but copy list values so extend() below cannot
    # mutate the caller's first part.
    base: Dict[str, Any] = {
        k: (list(v) if isinstance(v, list) else v) for k, v in parsed[0].items()
    }
    for obj in parsed[1:]:
        for k, v in obj.items():
            if isinstance(v, list) and isinstance(base.get(k), list):
                base[k].extend(v)
            elif k not in base:
                # Copy newly-added lists too, so later extends stay local.
                base[k] = list(v) if isinstance(v, list) else v

    if 'continuation' in base:
        base['continuation'] = None
    return base
|
||
|
||
|
||
def repairBrokenJson(text: str) -> Optional[Dict[str, Any]]:
    """
    Attempt to repair broken JSON using multiple strategies.
    Generic solution that works for any content type.

    Strategies, in order:
      1. Regex section extraction over the whole text (survives structural damage).
      2. Progressive parsing: find the longest prefix that parses once
         force-closed, then salvage sections from the broken tail.
      3. Structure closing over the full text.

    Returns the best repair attempt or None if all fail.
    """
    if not text:
        return None

    # Strategy 1: regex extraction handles cases where the JSON skeleton is
    # broken but the section fields themselves are intact.
    extractedSections = _extractSectionsRegex(text)
    if extractedSections:
        logger.info(f"Extracted {len(extractedSections)} sections using regex")
        return {
            "metadata": {
                "split_strategy": "single_document",
                "source_documents": [],
                "extraction_method": "ai_generation"
            },
            "documents": [{"sections": extractedSections}]
        }

    # Strategy 2: progressive parsing - find the longest valid prefix.
    # Coarse-to-fine step sizes keep the number of parse attempts bounded.
    bestResult: Optional[Dict[str, Any]] = None
    bestValidLength = 0
    for stepSize in (100, 50, 10, 1):
        for i in range(len(text), 0, -stepSize):
            candidate = _closeJsonStructures(text[:i])
            obj, err, _ = tryParseJson(candidate)
            if err is None and isinstance(obj, dict):
                bestResult = obj
                bestValidLength = i
                logger.debug(f"Progressive parsing success at length {i} (step: {stepSize})")
                break
        if bestResult:
            break

    if bestResult:
        logger.info(f"Repaired JSON using progressive parsing (valid length: {bestValidLength})")
        sections = extractSectionsFromDocument(bestResult)
        if sections:
            logger.info(f"Progressive parsing found {len(sections)} sections")
            return bestResult
        # No sections in the valid prefix - try to salvage from the broken tail.
        logger.info("Progressive parsing found no sections, trying to extract from broken part")
        extractedSections = _extractSectionsRegex(text[bestValidLength:])
        if extractedSections:
            logger.info(f"Extracted {len(extractedSections)} sections from broken part")
            # Merge into the valid part, creating containers where missing.
            # setdefault guards against a documents[0] without a "sections"
            # key, which previously raised KeyError.
            docs = bestResult.setdefault("documents", [])
            if not docs:
                docs.append({"sections": []})
            docs[0].setdefault("sections", []).extend(extractedSections)
        return bestResult

    # Strategy 3: close dangling brackets/braces on the full text and retry.
    obj, err, _ = tryParseJson(_closeJsonStructures(text))
    if err is None and isinstance(obj, dict):
        logger.info("Repaired JSON using structure closing")
        return obj

    logger.warning("All repair strategies failed")
    return None
|
||
|
||
|
||
def _closeJsonStructures(text: str) -> str:
|
||
"""
|
||
Close incomplete JSON structures by adding missing closing brackets.
|
||
"""
|
||
if not text:
|
||
return text
|
||
|
||
# Count open/close brackets and braces
|
||
openBraces = text.count('{')
|
||
closeBraces = text.count('}')
|
||
openBrackets = text.count('[')
|
||
closeBrackets = text.count(']')
|
||
|
||
# Close incomplete structures
|
||
result = text
|
||
for _ in range(openBraces - closeBraces):
|
||
result += '}'
|
||
for _ in range(openBrackets - closeBrackets):
|
||
result += ']'
|
||
|
||
return result
|
||
|
||
|
||
def _extractSectionsRegex(text: str) -> List[Dict[str, Any]]:
|
||
"""
|
||
Extract sections from broken JSON using regex patterns.
|
||
Generic solution that works for any content type.
|
||
"""
|
||
import re
|
||
|
||
sections = []
|
||
|
||
# Pattern to find section objects
|
||
sectionPattern = r'"id"\s*:\s*"(section_\d+)"\s*,?\s*"content_type"\s*:\s*"(\w+)"\s*,?\s*"order"\s*:\s*(\d+)'
|
||
|
||
for match in re.finditer(sectionPattern, text, re.IGNORECASE):
|
||
sectionId = match.group(1)
|
||
contentType = match.group(2)
|
||
order = int(match.group(3))
|
||
|
||
# Try to extract elements array - look for the elements array after this section
|
||
elementsMatch = re.search(
|
||
r'"elements"\s*:\s*\[(.*?)\]',
|
||
text[match.end():match.end()+5000] # Look ahead for elements (large range)
|
||
)
|
||
|
||
elements = []
|
||
if elementsMatch:
|
||
try:
|
||
elementsStr = '[' + elementsMatch.group(1) + ']'
|
||
elements = json.loads(elementsStr)
|
||
except:
|
||
# If JSON parsing fails, try to extract individual items manually
|
||
elementsText = elementsMatch.group(1)
|
||
elements = _extractElementsFromText(elementsText, contentType)
|
||
|
||
sections.append({
|
||
"id": sectionId,
|
||
"content_type": contentType,
|
||
"elements": elements,
|
||
"order": order
|
||
})
|
||
|
||
# If no sections found with the main pattern, try to find any content patterns
|
||
if not sections:
|
||
sections = _extractGenericContent(text)
|
||
|
||
return sections
|
||
|
||
|
||
def _extractElementsFromText(elementsText: str, contentType: str) -> List[Dict[str, Any]]:
|
||
"""
|
||
Extract elements from text when JSON parsing fails.
|
||
Generic approach that works for any content type.
|
||
Handles incomplete strings and corrupted data.
|
||
Excludes the last incomplete item to prevent corrupted data.
|
||
"""
|
||
import re
|
||
|
||
elements = []
|
||
|
||
if contentType == "list":
|
||
# Look for {"text": "..."} patterns, including incomplete ones
|
||
text_items = re.findall(r'\{"text"\s*:\s*"([^"]*)"\}', elementsText)
|
||
# Also look for incomplete patterns like {"text": "36
|
||
incomplete_items = re.findall(r'\{"text"\s*:\s*"([^"]*?)(?:\n|$)', elementsText)
|
||
|
||
# Combine both complete and incomplete items
|
||
all_items = text_items + incomplete_items
|
||
# Remove duplicates and empty strings
|
||
unique_items = list(dict.fromkeys([item for item in all_items if item.strip()]))
|
||
|
||
# Remove the last item if it appears to be incomplete/corrupted
|
||
if unique_items:
|
||
unique_items = _removeLastIncompleteItem(unique_items, elementsText)
|
||
|
||
elements = [{"text": item} for item in unique_items]
|
||
|
||
elif contentType == "paragraph":
|
||
# Look for {"text": "..."} patterns, including incomplete ones
|
||
text_items = re.findall(r'\{"text"\s*:\s*"([^"]*)"\}', elementsText)
|
||
incomplete_items = re.findall(r'\{"text"\s*:\s*"([^"]*?)(?:\n|$)', elementsText)
|
||
|
||
all_items = text_items + incomplete_items
|
||
unique_items = list(dict.fromkeys([item for item in all_items if item.strip()]))
|
||
|
||
# Remove the last item if it appears to be incomplete/corrupted
|
||
if unique_items:
|
||
unique_items = _removeLastIncompleteItem(unique_items, elementsText)
|
||
|
||
elements = [{"text": item} for item in unique_items]
|
||
|
||
elif contentType == "heading":
|
||
# Look for {"level": X, "text": "..."} patterns, including incomplete ones
|
||
heading_items = re.findall(r'\{"level"\s*:\s*(\d+)\s*,\s*"text"\s*:\s*"([^"]*)"\}', elementsText)
|
||
incomplete_heading_items = re.findall(r'\{"level"\s*:\s*(\d+)\s*,\s*"text"\s*:\s*"([^"]*?)(?:\n|$)', elementsText)
|
||
|
||
all_items = heading_items + incomplete_heading_items
|
||
unique_items = list(dict.fromkeys([(int(level), text) for level, text in all_items if text.strip()]))
|
||
|
||
# Remove the last item if it appears to be incomplete/corrupted
|
||
if unique_items:
|
||
unique_items = _removeLastIncompleteItem(unique_items, elementsText)
|
||
|
||
elements = [{"level": level, "text": text} for level, text in unique_items]
|
||
|
||
elif contentType == "table":
|
||
# Look for table patterns
|
||
table_items = re.findall(r'\{"headers"\s*:\s*\[(.*?)\]\s*,\s*"rows"\s*:\s*\[(.*?)\]\s*,\s*"caption"\s*:\s*"([^"]*)"\}', elementsText)
|
||
for headers_str, rows_str, caption in table_items:
|
||
# Extract headers
|
||
headers = re.findall(r'"([^"]+)"', headers_str)
|
||
# Extract rows (simplified)
|
||
rows = []
|
||
row_matches = re.findall(r'\[(.*?)\]', rows_str)
|
||
for row_match in row_matches:
|
||
row_items = re.findall(r'"([^"]+)"', row_match)
|
||
rows.append(row_items)
|
||
|
||
elements.append({
|
||
"headers": headers,
|
||
"rows": rows,
|
||
"caption": caption
|
||
})
|
||
|
||
elif contentType == "code":
|
||
# Look for {"code": "...", "language": "..."} patterns, including incomplete ones
|
||
code_items = re.findall(r'\{"code"\s*:\s*"([^"]*)"\s*,\s*"language"\s*:\s*"([^"]*)"\}', elementsText)
|
||
incomplete_code_items = re.findall(r'\{"code"\s*:\s*"([^"]*?)(?:\n|$)', elementsText)
|
||
|
||
all_items = code_items + [(code, "unknown") for code in incomplete_code_items]
|
||
unique_items = list(dict.fromkeys([(code, lang) for code, lang in all_items if code.strip()]))
|
||
|
||
# Remove the last item if it appears to be incomplete/corrupted
|
||
if unique_items:
|
||
unique_items = _removeLastIncompleteItem(unique_items, elementsText)
|
||
|
||
elements = [{"code": code, "language": lang} for code, lang in unique_items]
|
||
|
||
else:
|
||
# Generic fallback - look for any text content, including incomplete
|
||
text_items = re.findall(r'"text"\s*:\s*"([^"]*)"', elementsText)
|
||
incomplete_text_items = re.findall(r'"text"\s*:\s*"([^"]*?)(?:\n|$)', elementsText)
|
||
|
||
all_items = text_items + incomplete_text_items
|
||
unique_items = list(dict.fromkeys([item for item in all_items if item.strip()]))
|
||
|
||
# Remove the last item if it appears to be incomplete/corrupted
|
||
if unique_items:
|
||
unique_items = _removeLastIncompleteItem(unique_items, elementsText)
|
||
|
||
elements = [{"text": item} for item in unique_items]
|
||
|
||
return elements
|
||
|
||
|
||
def _removeLastIncompleteItem(items: List[str], original_text: str) -> List[str]:
|
||
"""
|
||
Remove the last item if it appears to be incomplete/corrupted.
|
||
This prevents corrupted data from being included in the final result.
|
||
"""
|
||
import re
|
||
|
||
if not items:
|
||
return items
|
||
|
||
# Check if the original text ends with incomplete JSON patterns
|
||
# Look for patterns that suggest the last item was cut off
|
||
|
||
# Pattern 1: Text ends with incomplete string like {"text": "36
|
||
if re.search(r'\{"[^"]*"\s*:\s*"[^"]*$', original_text):
|
||
logger.debug("Detected incomplete string at end - removing last item")
|
||
return items[:-1]
|
||
|
||
# Pattern 2: Text ends with incomplete boolean like {"bool_flag": tr
|
||
if re.search(r'\{"[^"]*"\s*:\s*(true|false|tr|fa)$', original_text):
|
||
logger.debug("Detected incomplete boolean at end - removing last item")
|
||
return items[:-1]
|
||
|
||
# Pattern 3: Text ends with incomplete number like {"number": 123
|
||
if re.search(r'\{"[^"]*"\s*:\s*\d+$', original_text):
|
||
logger.debug("Detected incomplete number at end - removing last item")
|
||
return items[:-1]
|
||
|
||
# Pattern 4: Text ends with incomplete array like {"array": [1,2,3
|
||
if re.search(r'\{"[^"]*"\s*:\s*\[[^\]]*$', original_text):
|
||
logger.debug("Detected incomplete array at end - removing last item")
|
||
return items[:-1]
|
||
|
||
# Pattern 5: Text ends with incomplete object like {"obj": {"key": "val
|
||
if re.search(r'\{"[^"]*"\s*:\s*\{[^}]*$', original_text):
|
||
logger.debug("Detected incomplete object at end - removing last item")
|
||
return items[:-1]
|
||
|
||
# Pattern 6: Text ends with trailing comma (common sign of incomplete JSON)
|
||
if original_text.rstrip().endswith(','):
|
||
logger.debug("Detected trailing comma - removing last item")
|
||
return items[:-1]
|
||
|
||
# If no incomplete patterns detected, return all items
|
||
return items
|
||
|
||
|
||
def _extractGenericContent(text: str) -> List[Dict[str, Any]]:
    """
    Extract generic content when no specific section patterns are found.
    This handles cases where the JSON structure is completely broken.
    Handles incomplete strings and corrupted data.
    Excludes the last incomplete item to prevent corrupted data.

    Returns a list holding at most one synthetic section (id "section_1");
    the three patterns below are mutually exclusive (if/elif chain).
    """
    import re

    sections = []

    # Look for any structured content patterns
    # Pattern 1: Look for list items {"text": "..."}, including incomplete ones
    list_items = re.findall(r'\{"text"\s*:\s*"([^"]*)"\}', text)
    # Incomplete variant: value cut off before its closing quote (truncated output).
    incomplete_list_items = re.findall(r'\{"text"\s*:\s*"([^"]*?)(?:\n|$)', text)

    all_list_items = list_items + incomplete_list_items
    # dict.fromkeys dedupes while preserving first-seen order; blanks dropped.
    unique_list_items = list(dict.fromkeys([item for item in all_list_items if item.strip()]))

    # Remove the last item if it appears to be incomplete/corrupted
    if unique_list_items:
        unique_list_items = _removeLastIncompleteItem(unique_list_items, text)

    if unique_list_items:
        elements = [{"text": item} for item in unique_list_items]
        sections.append({
            "id": "section_1",
            "content_type": "list",
            "elements": elements,
            "order": 1
        })

    # Pattern 2: Look for paragraph text {"text": "..."}, including incomplete ones
    # NOTE(review): this guard pattern has no closing quote before \}, so it
    # cannot match a well-formed {"text": "..."} object - confirm intent.
    elif re.search(r'\{"text"\s*:\s*"[^"]*\}', text):
        # Extract all text elements, including incomplete ones
        text_items = re.findall(r'\{"text"\s*:\s*"([^"]*)"\}', text)
        incomplete_text_items = re.findall(r'\{"text"\s*:\s*"([^"]*?)(?:\n|$)', text)

        all_text_items = text_items + incomplete_text_items
        unique_text_items = list(dict.fromkeys([item for item in all_text_items if item.strip()]))

        # Remove the last item if it appears to be incomplete/corrupted
        if unique_text_items:
            unique_text_items = _removeLastIncompleteItem(unique_text_items, text)

        if unique_text_items:
            elements = [{"text": item} for item in unique_text_items]
            sections.append({
                "id": "section_1",
                "content_type": "paragraph",
                "elements": elements,
                "order": 1
            })

    # Pattern 3: Look for any quoted strings that might be content, including incomplete ones
    elif re.search(r'"([^"]{3,})"', text):  # Strings of at least 3 chars (reduced threshold)
        # Extract longer quoted strings, including incomplete ones
        text_items = re.findall(r'"([^"]{3,})"', text)
        incomplete_text_items = re.findall(r'"([^"]{3,}?)(?:\n|$)', text)

        all_text_items = text_items + incomplete_text_items
        # Filter out likely JSON keys
        content_items = [item for item in all_text_items if not item.startswith(('section_', 'doc_', 'metadata', 'split_strategy', 'source_documents', 'extraction_method', 'id', 'content_type', 'elements', 'order', 'title', 'filename'))]

        # Remove the last item if it appears to be incomplete/corrupted
        if content_items:
            content_items = _removeLastIncompleteItem(content_items, text)

        if content_items:
            elements = [{"text": item} for item in content_items[:10]]  # Limit to first 10 items
            sections.append({
                "id": "section_1",
                "content_type": "paragraph",
                "elements": elements,
                "order": 1
            })

    return sections
|
||
|
||
|
||
def extractSectionsFromDocument(documentData: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Extract all sections from document data structure.
    Handles both flat and nested document structures.
    """
    if not isinstance(documentData, dict):
        return []

    # Nested layout: {"documents": [{"sections": [...]}, ...]}
    if "documents" in documentData:
        collected: List[Dict[str, Any]] = []
        for doc in documentData.get("documents", []):
            if isinstance(doc, dict) and "sections" in doc:
                doc_sections = doc.get("sections", [])
                if isinstance(doc_sections, list):
                    collected.extend(doc_sections)
        return collected

    # Flat layout: {"sections": [...]} at the root.
    root_sections = documentData.get("sections")
    if isinstance(root_sections, list):
        return root_sections

    return []
|
||
|
||
|
||
def extractContentSample(section: Dict[str, Any]) -> str:
    """
    Extract a sample of content from a section for continuation context.
    Returns a string describing the last content for context.
    """
    if not isinstance(section, dict):
        return ""

    kind = section.get("content_type", "").lower()
    elems = section.get("elements", [])

    if not elems or not isinstance(elems, list):
        return "Content exists"

    # Sample from the tail; slicing handles len <= 5 the same way.
    tail = elems[-5:]

    if kind == "list":
        # Collect texts of the sampled items; report the first three.
        texts = [e.get("text", "") for e in tail if isinstance(e, dict) and "text" in e]
        if texts:
            return f"Last {len(texts)} items: {', '.join(texts[:3])}"

    elif kind == "paragraph":
        # First sampled element with text wins; show its last 150 chars.
        for e in tail:
            if isinstance(e, dict) and "text" in e:
                body = e.get("text", "")
                if len(body) > 150:
                    body = "..." + body[-150:]
                return f"Last content: {body}"

    elif kind == "code":
        # First sampled element with code wins; summarize its line count.
        for e in tail:
            if isinstance(e, dict) and "code" in e:
                code_lines = e.get("code", "").split('\n')
                if len(code_lines) > 5:
                    return f"Last lines ({len(code_lines)} total): {', '.join(code_lines[-3:])}"
                return f"Code ({len(code_lines)} lines)"

    elif kind == "table":
        # First sampled element with rows wins; summarize its row count.
        for e in tail:
            if isinstance(e, dict) and "rows" in e:
                return f"Table with {len(e.get('rows', []))} rows"

    return "Content exists"
|
||
|
||
|
||
def _buildDetailedContinuationInfo(section: Dict[str, Any], content_type: str) -> Dict[str, Any]:
    """
    Build detailed continuation information for better AI guidance.
    Completely generic - works for any content type (list, paragraph, code, table, etc.)
    """
    elements = section.get("elements", [])

    # Both branches share the same shape; only item_count and guidance differ.
    info: Dict[str, Any] = {
        "type": "continue_general",
        "sample": extractContentSample(section),
        "last_item": "",
        "item_count": len(elements) if elements else 0,
    }
    if elements:
        # Tell AI to generate ALL REMAINING content to complete the user request.
        info["guidance"] = "Generate ALL remaining content to complete the user's request. Continue from where you left off and finish everything that was requested."
    else:
        info["guidance"] = "Continue generating content in the same format and style."
    return info
|
||
|
||
|
||
def _extractLastItemsFromFragment(fragment: str, max_items: int = 10) -> str:
    """
    Extract the last few items from a JSON fragment for continuation context.
    Uses JSON structure (sections -> elements -> items) - fully generic.
    Works with broken/incomplete JSON by trying to parse and extract sections.

    Returns a comma-joined string of the most recent item texts, or "" when
    nothing could be recovered by any strategy.
    """
    if not fragment:
        return ""

    # Strategy 1: Try to parse as JSON and extract from structure
    try:
        # Try to repair and parse the fragment
        parsed = repairBrokenJson(fragment)
        if parsed:
            # Extract sections from parsed JSON using structure
            sections = extractSectionsFromDocument(parsed)
            if sections:
                # Get the last section (likely where continuation should happen)
                sorted_sections = sorted(sections, key=lambda s: s.get("order", 0))
                last_section = sorted_sections[-1]
                elements = last_section.get("elements", [])

                if elements and isinstance(elements, list):
                    content_type = last_section.get("content_type", "").lower()

                    # For list content_type, extract from items array
                    if content_type == "list" and len(elements) > 0:
                        last_element = elements[-1]
                        if isinstance(last_element, dict):
                            # Check if it has an "items" array (list structure)
                            if "items" in last_element and isinstance(last_element["items"], list):
                                items_list = last_element["items"]
                                if items_list:
                                    # Get last max_items from this items array
                                    last_items = items_list[-max_items:] if len(items_list) > max_items else items_list
                                    # Extract text from each item
                                    texts = []
                                    for item in last_items:
                                        if isinstance(item, dict) and "text" in item:
                                            texts.append(str(item["text"]))
                                    if texts:
                                        return ', '.join(texts)

                            # Or if elements themselves are items (alternative structure)
                            elif "text" in last_element:
                                # Get last max_items elements that have text
                                elements_with_text = [e for e in elements if isinstance(e, dict) and "text" in e]
                                if elements_with_text:
                                    last_elements = elements_with_text[-max_items:] if len(elements_with_text) > max_items else elements_with_text
                                    texts = [str(e.get("text", "")) for e in last_elements]
                                    if texts:
                                        return ', '.join(texts)

                    # For other content types, extract from elements
                    elif len(elements) > 0:
                        # Get last max_items elements that have text/code
                        valid_elements = [e for e in elements if isinstance(e, dict) and ("text" in e or "code" in e)]
                        if valid_elements:
                            last_elements = valid_elements[-max_items:] if len(valid_elements) > max_items else valid_elements
                            texts = []
                            for elem in last_elements:
                                if "text" in elem:
                                    texts.append(str(elem["text"]))
                                elif "code" in elem:
                                    # For code, show snippet
                                    code = str(elem["code"])
                                    texts.append(code[:50] + "..." if len(code) > 50 else code)
                            if texts:
                                return ', '.join(texts)
    except Exception as e:
        logger.debug(f"Could not extract items from fragment using JSON structure: {e}")

    # Strategy 2: If parsing failed, try progressive parsing from the end
    # Look for the last complete JSON structures near the end
    try:
        # Try parsing different lengths from the end (largest window first)
        for length in [3000, 2000, 1000, 500]:
            if len(fragment) > length:
                end_portion = fragment[-length:]
                closed = _closeJsonStructures(end_portion)
                obj, err, _ = tryParseJson(closed)
                if err is None and isinstance(obj, dict):
                    # Successfully parsed - extract sections
                    sections = extractSectionsFromDocument(obj)
                    if sections:
                        # Same extraction logic as above
                        sorted_sections = sorted(sections, key=lambda s: s.get("order", 0))
                        if sorted_sections:
                            last_section = sorted_sections[-1]
                            elements = last_section.get("elements", [])
                            if elements:
                                # Extract texts using same logic as Strategy 1
                                texts = []
                                for elem in elements[-max_items:]:
                                    if isinstance(elem, dict):
                                        if "items" in elem and isinstance(elem["items"], list):
                                            # Get last item from items array
                                            if elem["items"]:
                                                last_item = elem["items"][-1]
                                                if isinstance(last_item, dict) and "text" in last_item:
                                                    texts.append(str(last_item["text"]))
                                        elif "text" in elem:
                                            texts.append(str(elem["text"]))
                                if texts:
                                    return ', '.join(texts[-max_items:])
    except Exception as e:
        logger.debug(f"Progressive parsing from end failed: {e}")

    # Strategy 3: If all parsing fails, try simple extraction from raw fragment
    # Look for last complete {"text": "..."} pattern near the end
    try:
        # Look at last 2000 chars for the pattern
        end_portion = fragment[-2000:] if len(fragment) > 2000 else fragment
        # Find all {"text": "value"} patterns
        import re
        # Pattern to match complete {"text": "..."} objects
        # NOTE(review): [^"]+ does not handle escaped quotes inside the value.
        pattern = r'\{"text"\s*:\s*"([^"]+)"\}'
        matches = re.findall(pattern, end_portion)
        if matches:
            # Get last max_items
            last_matches = matches[-max_items:] if len(matches) > max_items else matches
            return ', '.join(last_matches)
    except Exception as e:
        logger.debug(f"Simple pattern extraction failed: {e}")

    # Strategy 4: If all fails, return empty (will use last_item_from_sections)
    return ""
|
||
|
||
|
||
def buildContinuationContext(allSections: List[Dict[str, Any]], lastRawResponse: Optional[str] = None) -> Dict[str, Any]:
|
||
"""
|
||
Build context information from accumulated sections for continuation prompt.
|
||
Extracts last items and provides clear continuation point.
|
||
|
||
CRITICAL: Analyzes ALL accumulated sections (not just last response) to provide
|
||
accurate progress information to AI. This allows AI to understand completion status
|
||
without seeing the entire content (which would exceed token limits).
|
||
|
||
Args:
|
||
allSections: List of ALL sections accumulated across ALL iterations
|
||
lastRawResponse: Raw JSON response from last iteration (can be broken/incomplete)
|
||
|
||
Returns:
|
||
Dict with section_count, last_raw_json, last_items, continuation point, and
|
||
PROGRESS STATISTICS from all accumulated sections
|
||
"""
|
||
context = {
|
||
"section_count": len(allSections),
|
||
}
|
||
|
||
# CRITICAL: Analyze ALL accumulated sections to get accurate progress statistics
|
||
# This allows AI to understand completion status without seeing entire content
|
||
# GENERIC approach: Works for all task types (books, reports, code, lists, etc.)
|
||
totalRows = 0
|
||
totalItems = 0
|
||
totalCodeLines = 0
|
||
totalParagraphs = 0
|
||
totalHeadings = 0
|
||
totalContentSize = 0
|
||
contentTypes = set()
|
||
lastContentType = None
|
||
|
||
for section in allSections:
|
||
contentType = section.get("content_type", "")
|
||
contentTypes.add(contentType)
|
||
elements = section.get("elements", [])
|
||
|
||
# CRITICAL: Iterate through ALL elements, not just the last one
|
||
# This ensures we count all rows/items/lines from all elements in the section
|
||
if isinstance(elements, list):
|
||
# Multiple elements - iterate through all
|
||
for elem in elements:
|
||
if isinstance(elem, dict):
|
||
if contentType == "code_block":
|
||
code = elem.get("code", "")
|
||
if code:
|
||
lines = [l for l in code.split('\n') if l.strip()]
|
||
totalCodeLines += len(lines)
|
||
totalContentSize += len(code)
|
||
lastContentType = "code_block"
|
||
|
||
elif contentType == "table":
|
||
rows = elem.get("rows", [])
|
||
if isinstance(rows, list):
|
||
totalRows += len(rows) # Count ALL rows from ALL table elements
|
||
totalContentSize += len(str(rows))
|
||
lastContentType = "table"
|
||
|
||
elif contentType in ["bullet_list", "numbered_list"]:
|
||
items = elem.get("items", [])
|
||
if isinstance(items, list):
|
||
totalItems += len(items) # Count ALL items from ALL list elements
|
||
totalContentSize += len(str(items))
|
||
lastContentType = "list"
|
||
|
||
elif contentType == "heading":
|
||
text = elem.get("text", "")
|
||
if text:
|
||
totalHeadings += 1
|
||
totalContentSize += len(text)
|
||
lastContentType = "heading"
|
||
|
||
elif contentType == "paragraph":
|
||
text = elem.get("text", "")
|
||
if text:
|
||
totalParagraphs += 1
|
||
totalContentSize += len(text)
|
||
lastContentType = "paragraph"
|
||
elif isinstance(elements, dict):
|
||
# Single element as dict
|
||
elem = elements
|
||
if contentType == "code_block":
|
||
code = elem.get("code", "")
|
||
if code:
|
||
lines = [l for l in code.split('\n') if l.strip()]
|
||
totalCodeLines += len(lines)
|
||
totalContentSize += len(code)
|
||
lastContentType = "code_block"
|
||
|
||
elif contentType == "table":
|
||
rows = elem.get("rows", [])
|
||
if isinstance(rows, list):
|
||
totalRows += len(rows)
|
||
totalContentSize += len(str(rows))
|
||
lastContentType = "table"
|
||
|
||
elif contentType in ["bullet_list", "numbered_list"]:
|
||
items = elem.get("items", [])
|
||
if isinstance(items, list):
|
||
totalItems += len(items)
|
||
totalContentSize += len(str(items))
|
||
lastContentType = "list"
|
||
|
||
elif contentType == "heading":
|
||
text = elem.get("text", "")
|
||
if text:
|
||
totalHeadings += 1
|
||
totalContentSize += len(text)
|
||
lastContentType = "heading"
|
||
|
||
elif contentType == "paragraph":
|
||
text = elem.get("text", "")
|
||
if text:
|
||
totalParagraphs += 1
|
||
totalContentSize += len(text)
|
||
lastContentType = "paragraph"
|
||
|
||
# Store progress statistics (not full content - that would exceed token limits)
|
||
# These statistics help AI understand progress for ALL task types
|
||
context["progress_stats"] = {
|
||
"total_rows": totalRows,
|
||
"total_items": totalItems,
|
||
"total_code_lines": totalCodeLines,
|
||
"total_paragraphs": totalParagraphs,
|
||
"total_headings": totalHeadings,
|
||
"total_content_size": totalContentSize,
|
||
"section_count": len(allSections),
|
||
"content_type_count": len(contentTypes),
|
||
"content_types": list(contentTypes),
|
||
"last_content_type": lastContentType
|
||
}
|
||
|
||
# Extract last complete sub-item from allSections (already merged, contains all delivered data)
|
||
# Extract cut/incomplete sub-item from raw JSON (what was cut off)
|
||
last_complete_subobject = None
|
||
cut_subobject = None
|
||
content_type_for_items = None
|
||
total_items_count = 0
|
||
|
||
# STEP 1: Extract last complete sub-item from allSections (this is what was already delivered)
|
||
if allSections:
|
||
sorted_sections = sorted(allSections, key=lambda s: s.get("order", 0))
|
||
last_section = sorted_sections[-1]
|
||
content_type_for_items = last_section.get("content_type", "")
|
||
elements = last_section.get("elements", [])
|
||
|
||
if elements and isinstance(elements, list) and len(elements) > 0:
|
||
last_element = elements[-1]
|
||
if isinstance(last_element, dict):
|
||
# TABLE: Extract last complete row
|
||
if content_type_for_items == "table" and "rows" in last_element:
|
||
rows = last_element.get("rows", [])
|
||
if rows and isinstance(rows, list) and len(rows) > 0:
|
||
total_items_count = len(rows)
|
||
last_complete_subobject = rows[-1]
|
||
|
||
# LIST: Extract last complete item
|
||
elif content_type_for_items in ["bullet_list", "numbered_list"] and "items" in last_element:
|
||
items = last_element.get("items", [])
|
||
if items and isinstance(items, list) and len(items) > 0:
|
||
total_items_count = len(items)
|
||
last_complete_subobject = items[-1]
|
||
|
||
# CODE_BLOCK: Extract last complete line
|
||
elif content_type_for_items == "code_block" and "code" in last_element:
|
||
code = last_element.get("code", "")
|
||
if code:
|
||
lines = [l for l in code.split('\n') if l.strip()]
|
||
total_items_count = len(lines)
|
||
if lines:
|
||
last_complete_subobject = lines[-1]
|
||
|
||
# PARAGRAPH/HEADING: Extract last complete sentence
|
||
elif content_type_for_items in ["paragraph", "heading"] and "text" in last_element:
|
||
text = last_element.get("text", "")
|
||
if text:
|
||
import re
|
||
sentences = re.split(r'([.!?]+)', text)
|
||
complete_sentences = []
|
||
for i in range(0, len(sentences) - 1, 2):
|
||
if i + 1 < len(sentences):
|
||
complete_sentences.append(sentences[i] + sentences[i + 1])
|
||
total_items_count = len(complete_sentences)
|
||
if complete_sentences:
|
||
last_complete_subobject = complete_sentences[-1]
|
||
|
||
# STEP 2: Extract cut/incomplete sub-item from raw JSON (what was cut off)
|
||
if lastRawResponse:
|
||
raw_json = stripCodeFences(lastRawResponse.strip())
|
||
if raw_json and raw_json.strip() != "{}":
|
||
try:
|
||
import re
|
||
if content_type_for_items == "code_block":
|
||
# Find incomplete code line at the end
|
||
# Look for code string that doesn't end with closing quote
|
||
code_match = re.search(r'"code"\s*:\s*"([^"]*?)(?:"|$)', raw_json)
|
||
if code_match:
|
||
code_content = code_match.group(1)
|
||
try:
|
||
code_content = json.loads('"' + code_content + '"')
|
||
except:
|
||
pass
|
||
lines = code_content.split('\n')
|
||
if lines and not raw_json.rstrip().endswith('"'):
|
||
# Code string is incomplete - last line is cut
|
||
cut_subobject = lines[-1] if lines else None
|
||
elif content_type_for_items == "table":
|
||
# Find incomplete row at the end
|
||
row_pattern = r'\["([^"]*)"(?:,\s*"([^"]*)")*'
|
||
matches = list(re.finditer(row_pattern, raw_json))
|
||
if matches:
|
||
last_match = matches[-1]
|
||
end_pos = last_match.end()
|
||
if end_pos < len(raw_json):
|
||
remaining = raw_json[end_pos:end_pos+20].strip()
|
||
if not remaining.startswith(']'):
|
||
# Row is incomplete - extract values
|
||
cut_values = re.findall(r'"([^"]*)"', raw_json[last_match.start():last_match.end()])
|
||
if cut_values:
|
||
cut_subobject = cut_values
|
||
elif content_type_for_items in ["bullet_list", "numbered_list"]:
|
||
# Find incomplete item at the end
|
||
item_pattern = r'"([^"]*)"'
|
||
matches = list(re.finditer(item_pattern, raw_json))
|
||
if matches:
|
||
last_match = matches[-1]
|
||
end_pos = last_match.end()
|
||
if end_pos < len(raw_json):
|
||
remaining = raw_json[end_pos:end_pos+10].strip()
|
||
if remaining and remaining[0] not in [',', ']', '}', '"']:
|
||
cut_subobject = last_match.group(1)
|
||
except Exception as e:
|
||
logger.debug(f"Could not extract cut sub-object from raw JSON: {e}")
|
||
|
||
context["last_raw_json"] = raw_json
|
||
else:
|
||
context["last_raw_json"] = ""
|
||
else:
|
||
context["last_raw_json"] = ""
|
||
|
||
# Convert to JSON strings
|
||
if last_complete_subobject is not None:
|
||
try:
|
||
last_complete_subobject = json.dumps(last_complete_subobject)
|
||
except:
|
||
last_complete_subobject = str(last_complete_subobject)
|
||
|
||
if cut_subobject is not None:
|
||
try:
|
||
cut_subobject = json.dumps(cut_subobject)
|
||
except:
|
||
cut_subobject = str(cut_subobject)
|
||
|
||
context["last_item_object"] = last_complete_subobject if last_complete_subobject else ""
|
||
context["cut_item_object"] = cut_subobject if cut_subobject else None
|
||
context["content_type_for_items"] = content_type_for_items
|
||
context["total_items_count"] = total_items_count
|
||
|
||
return context
|
||
|
||
|
||
def parseJsonWithModel(jsonString: str, modelClass: Type[T]) -> T:
    """
    Parse a JSON string into a Pydantic model, with extraction and repair fallbacks.

    Pipeline (each stage falls through to the next on failure):
      1. extractJsonString() - strips code fences / surrounding text.
      2. tryParseJson()      - safe parse of the extracted text.
      3. repairBrokenJson()  - attempts to fix malformed/incomplete JSON.

    Args:
        jsonString: Raw text expected to contain JSON (may include code
            fences, extra prose, etc.).
        modelClass: Pydantic model class to instantiate from the parsed data.

    Returns:
        An instance of modelClass. If the parsed JSON is a list, the first
        element is used.

    Raises:
        ValueError: If the JSON cannot be parsed, repaired, or validated.
    """
    if not jsonString:
        raise ValueError(f"Cannot parse empty JSON string for {modelClass.__name__}")

    # Step 1: Extract JSON string (handles code fences, extra text)
    extractedJson = extractJsonString(jsonString)
    if not extractedJson or not extractedJson.strip():
        raise ValueError(f"No JSON found in string for {modelClass.__name__}")

    # Step 2: Try to parse as JSON
    parsedJson, error, cleaned = tryParseJson(extractedJson)

    if error is None and parsedJson is not None:
        # Validate the shape OUTSIDE the try block so these deliberate
        # ValueErrors are not swallowed and re-wrapped by the generic
        # "Failed to create ..." handler below (a bug in the original).
        if isinstance(parsedJson, dict):
            payload = parsedJson
        elif isinstance(parsedJson, list):
            if not parsedJson:
                raise ValueError(f"Empty list cannot be parsed as {modelClass.__name__}")
            # If the model expects a single object, take the first list item.
            payload = parsedJson[0]
        else:
            raise ValueError(f"Parsed JSON is not a dict or list: {type(parsedJson)}")

        try:
            return modelClass(**payload)
        except ValidationError as e:
            logger.error(f"Validation error parsing {modelClass.__name__}: {e}")
            # Chain the cause so the Pydantic error detail is preserved.
            raise ValueError(f"Invalid data for {modelClass.__name__}: {e}") from e
        except Exception as e:
            logger.error(f"Error creating {modelClass.__name__} instance: {e}")
            raise ValueError(f"Failed to create {modelClass.__name__} instance: {e}") from e

    # Step 3: Try to repair broken JSON
    logger.warning(f"Initial JSON parsing failed, attempting repair for {modelClass.__name__}")
    repairedJson = repairBrokenJson(extractedJson)

    if repairedJson:
        # Round-trip via json.dumps: repairBrokenJson appears to return a
        # parsed object while tryParseJson expects text — TODO(review): confirm.
        parsedRepaired, errorRepaired, _ = tryParseJson(json.dumps(repairedJson))

        if errorRepaired is None and parsedRepaired is not None:
            try:
                if isinstance(parsedRepaired, dict):
                    return modelClass(**parsedRepaired)
                elif isinstance(parsedRepaired, list) and parsedRepaired:
                    return modelClass(**parsedRepaired[0])
                # Any other shape falls through to the final error below.
            except ValidationError as e:
                logger.error(f"Validation error parsing repaired {modelClass.__name__}: {e}")
                raise ValueError(f"Invalid repaired data for {modelClass.__name__}: {e}") from e
            except Exception as e:
                # Best-effort repair path: log and fall through to Step 4.
                logger.error(f"Error creating {modelClass.__name__} from repaired JSON: {e}")

    # Step 4: All parsing failed.
    # `cleaned` may be None on hard parse failures — guard the slice so we
    # raise the intended ValueError rather than a TypeError.
    preview = (cleaned or extractedJson)[:200]
    logger.error(f"Failed to parse JSON for {modelClass.__name__}. Cleaned JSON preview: {preview}...")
    raise ValueError(f"Failed to parse or validate JSON for {modelClass.__name__}. JSON may be malformed or incomplete.")
|
||
|