"""Utilities for extracting, cleaning, parsing, repairing, and merging JSON text."""
import json
|
||
import logging
|
||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||
|
||
# Module-level logger named after this module, per the stdlib logging convention.
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def stripCodeFences(text: str) -> str:
    """Remove ```json / ``` fences and surrounding whitespace if present.

    Handles three fence shapes:
      * ```json\\n{...}\\n```  -- language tag on its own line (dropped)
      * ```\\n{...}\\n```      -- no language tag
      * ```json{...}```        -- single-line fence with no newline after the tag

    The first line after the fence is only discarded when it actually looks
    like a language tag (alphanumeric word); JSON content on the first line
    is preserved.
    """
    if not text:
        return text
    s = text.strip()
    # Require distinct opening and closing fences (at least 6 backticks total).
    if not (s.startswith("```") and s.endswith("```") and len(s) >= 6):
        return s
    inner = s[3:-3]
    nl = inner.find('\n')
    if nl != -1:
        # Drop the first line only when it is empty or a plain language tag
        # (e.g. 'json'); otherwise it is content and must be kept.
        tag = inner[:nl].strip()
        if tag == "" or tag.isalnum():
            inner = inner[nl + 1:]
    elif inner[:4].lower() == "json":
        # Single-line fence like ```json{...}``` -- peel the tag directly.
        inner = inner[4:]
    return inner.strip()


def extractFirstBalancedJson(text: str) -> str:
    """Return the first balanced JSON object/array substring; otherwise return trimmed input.

    String-literal aware: brackets and braces appearing inside JSON string
    literals (including after backslash escapes) do not affect the balance
    scan, so inputs like '{"a": "}"}' are extracted intact.
    """
    if not text:
        return text
    s = text.strip()

    # Start the scan at the first '{' or '[', whichever comes first.
    candidates = [idx for idx in (s.find('{'), s.find('[')) if idx != -1]
    if not candidates:
        return s
    start = min(candidates)

    stack: List[str] = []
    in_string = False
    escaped = False
    for i in range(start, len(s)):
        ch = s[i]
        if in_string:
            # Inside a string literal: only track escapes and the closing quote.
            if escaped:
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
        elif ch in '{[':
            stack.append(ch)
        elif ch in '}]':
            if not stack:
                continue
            opener = stack.pop()
            # Mismatched closer: tolerate it, matching the original lenient scan.
            if (opener == '{' and ch != '}') or (opener == '[' and ch != ']'):
                continue
            if not stack:
                return s[start:i + 1].strip()
    return s


def normalizeJsonText(text: str) -> str:
    """Light normalization: remove BOM, normalize smart quotes."""
    if not text:
        return text
    result = text
    # Drop any leading UTF-8 BOM characters.
    if result.startswith('\ufeff'):
        result = result.lstrip('\ufeff')
    # Map curly quotes to straight ASCII quotes in a single C-level pass.
    smart_quote_map = str.maketrans({'“': '"', '”': '"', '’': "'", '‘': "'"})
    return result.translate(smart_quote_map)


def extractJsonString(text: str) -> str:
    """Strip code fences, normalize, then extract first balanced JSON substring."""
    # Run the cleaning pipeline: normalize first, then unfence, then extract.
    result = text
    for step in (normalizeJsonText, stripCodeFences, extractFirstBalancedJson):
        result = step(result)
    return result.strip()


def tryParseJson(text: Union[str, bytes]) -> Tuple[Optional[Union[Dict, List]], Optional[Exception], str]:
    """Extract and parse JSON; return (obj, error, cleaned_str)."""
    if isinstance(text, bytes):
        # Decode bytes defensively; fall back to repr-style conversion.
        try:
            decoded = text.decode('utf-8', errors='replace')
        except Exception:
            decoded = str(text)
        text = decoded
    cleaned = extractJsonString(text or "")
    try:
        parsed = json.loads(cleaned)
    except Exception as exc:
        return None, exc, cleaned
    return parsed, None, cleaned


def parseJsonOrRaise(text: Union[str, bytes]) -> Union[Dict, List]:
    """Parse *text* as JSON, raising the underlying parse error on failure.

    Logs a 200-character preview of the cleaned text before re-raising.
    """
    obj, err, cleaned = tryParseJson(text)
    if err is not None:
        # Lazy %-style args: formatting is skipped when the level is disabled.
        # %.200s truncates the preview, matching the old cleaned[:200] slice.
        logger.error("parse_json_or_raise failed: %s. Cleaned preview: %.200s...", err, cleaned)
        raise err
    return obj


def mergeRootLists(json_parts: List[Union[str, Dict, List]]) -> Dict[str, Any]:
    """
    Generic merger for root-level lists: take first dict as base; for each subsequent part:
    - if value is list and same key exists as list, extend it
    - if key absent, add it
    - for non-list keys, keep the original (from the first part)
    Sets continuation=None if present in base.

    String parts are parsed with tryParseJson first; parts that fail to parse
    or are not dicts are skipped.  The input dicts are never mutated.
    """
    parsed: List[Dict[str, Any]] = []
    for part in json_parts:
        if isinstance(part, (dict, list)):
            obj = part
        else:
            obj, err, _ = tryParseJson(part)
            if err is not None or not isinstance(obj, (dict, list)):
                continue
        # Only dicts can serve as merge participants; root lists are ignored.
        if isinstance(obj, dict):
            parsed.append(obj)
    if not parsed:
        return {}

    # Copy list values as well as the top-level dict: extending a shared list
    # in place would otherwise mutate the caller's first dict (previous bug).
    base: Dict[str, Any] = {
        k: (list(v) if isinstance(v, list) else v) for k, v in parsed[0].items()
    }
    for obj in parsed[1:]:
        for k, v in obj.items():
            if isinstance(v, list) and isinstance(base.get(k), list):
                base[k].extend(v)
            elif k not in base:
                base[k] = v
    if 'continuation' in base:
        # Merged output is complete; clear any continuation marker.
        base['continuation'] = None
    return base


def repairBrokenJson(text: str) -> Optional[Dict[str, Any]]:
    """
    Attempt to repair broken JSON using multiple strategies.

    Returns the best repair attempt or None if all fail.
    """
    if not text:
        return None

    # Strategy 1: progressive parsing - find the longest prefix that becomes
    # valid JSON once its open structures are closed.  The first iteration
    # (i == len(text)) is exactly "close the whole text's structures", so the
    # old separate structure-closing pass was redundant and has been removed.
    for i in range(len(text), 0, -1):
        closed_str = _closeJsonStructures(text[:i])
        obj, err, _ = tryParseJson(closed_str)
        # Check err against None rather than the truthiness of the result so
        # that an empty dict ({}) still counts as a successful repair.
        if err is None and isinstance(obj, dict):
            logger.info("Repaired JSON using progressive parsing (valid length: %d)", i)
            return obj

    # Strategy 2: regex extraction (fallback for completely broken JSON).
    extracted = _extractSectionsRegex(text)
    if extracted:
        logger.info("Repaired JSON using regex extraction")
        return {"documents": [{"sections": extracted}]}

    logger.warning("All repair strategies failed")
    return None


def _closeJsonStructures(text: str) -> str:
|
||
"""
|
||
Close incomplete JSON structures by adding missing closing brackets.
|
||
"""
|
||
if not text:
|
||
return text
|
||
|
||
# Count open/close brackets and braces
|
||
open_braces = text.count('{')
|
||
close_braces = text.count('}')
|
||
open_brackets = text.count('[')
|
||
close_brackets = text.count(']')
|
||
|
||
# Close incomplete structures
|
||
result = text
|
||
for _ in range(open_braces - close_braces):
|
||
result += '}'
|
||
for _ in range(open_brackets - close_brackets):
|
||
result += ']'
|
||
|
||
return result
|
||
|
||
|
||
def _extractSectionsRegex(text: str) -> List[Dict[str, Any]]:
|
||
"""
|
||
Extract sections from broken JSON using regex patterns.
|
||
Fallback strategy when JSON is completely corrupted.
|
||
"""
|
||
import re
|
||
|
||
sections = []
|
||
|
||
# Pattern to find section objects
|
||
section_pattern = r'"id"\s*:\s*"(section_\d+)"\s*,?\s*"content_type"\s*:\s*"(\w+)"\s*,?\s*"order"\s*:\s*(\d+)'
|
||
|
||
for match in re.finditer(section_pattern, text, re.IGNORECASE):
|
||
section_id = match.group(1)
|
||
content_type = match.group(2)
|
||
order = int(match.group(3))
|
||
|
||
# Try to extract elements array
|
||
elements_match = re.search(
|
||
r'"elements"\s*:\s*\[(.*?)\]',
|
||
text[match.end():match.end()+500] # Look ahead for elements
|
||
)
|
||
|
||
elements = []
|
||
if elements_match:
|
||
try:
|
||
elements_str = '[' + elements_match.group(1) + ']'
|
||
elements = json.loads(elements_str)
|
||
except:
|
||
pass
|
||
|
||
sections.append({
|
||
"id": section_id,
|
||
"content_type": content_type,
|
||
"elements": elements,
|
||
"order": order
|
||
})
|
||
|
||
return sections
|
||
|
||
|
||
def extractSectionsFromDocument(documentData: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Extract all sections from document data structure.

    Handles both flat and nested document structures.
    """
    if not isinstance(documentData, dict):
        return []

    # Nested shape: {"documents": [{"sections": [...]}, ...]}.
    # When a 'documents' key is present it wins, even if it yields nothing.
    if "documents" in documentData:
        collected: List[Dict[str, Any]] = []
        for entry in documentData.get("documents", []):
            if not isinstance(entry, dict):
                continue
            candidate = entry.get("sections")
            if isinstance(candidate, list):
                collected.extend(candidate)
        return collected

    # Flat shape: {"sections": [...]} at the root.
    root_sections = documentData.get("sections")
    if isinstance(root_sections, list):
        return root_sections

    return []


def extractContentSample(section: Dict[str, Any]) -> str:
    """
    Extract a sample of content from a section for continuation context.

    Returns a string describing the last content for context.  Scans the
    sampled elements from the end so the reported sample really is the
    *last* matching element (the old code returned the first of the last
    five).
    """
    if not isinstance(section, dict):
        return ""

    # 'or ""' guards against content_type being present but None.
    content_type = (section.get("content_type") or "").lower()
    elements = section.get("elements", [])

    if not elements or not isinstance(elements, list):
        return "Content exists"

    # Sample at most the last five elements.
    sample_elements = elements[-5:]

    if content_type == "list":
        items_text = [elem.get("text", "") for elem in sample_elements
                      if isinstance(elem, dict) and "text" in elem]
        if items_text:
            return f"Last {len(items_text)} items: {', '.join(items_text[:3])}"

    elif content_type == "paragraph":
        # Take the last text element; show at most its trailing 150 chars.
        for elem in reversed(sample_elements):
            if isinstance(elem, dict) and "text" in elem:
                text = elem.get("text", "")
                if len(text) > 150:
                    text = "..." + text[-150:]
                return f"Last content: {text}"

    elif content_type == "code":
        # Summarize the last code element by its trailing lines.
        for elem in reversed(sample_elements):
            if isinstance(elem, dict) and "code" in elem:
                lines = elem.get("code", "").split('\n')
                if len(lines) > 5:
                    return f"Last lines ({len(lines)} total): {', '.join(lines[-3:])}"
                return f"Code ({len(lines)} lines)"

    elif content_type == "table":
        for elem in reversed(sample_elements):
            if isinstance(elem, dict) and "rows" in elem:
                rows = elem.get("rows", [])
                return f"Table with {len(rows)} rows"

    # Unknown type or no matching element found in the sample.
    return "Content exists"


def buildContinuationContext(allSections: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Build context information from accumulated sections for continuation prompt.

    Returns dict with metadata about what was already generated.
    """
    if not allSections:
        return {
            "section_count": 0,
            "next_order": 1,
            "last_content_sample": "No content yet",
        }

    # Stable sort by 'order'; the final entry is the most recent section
    # (on ties, the later-listed section wins, as before).
    ordered = sorted(allSections, key=lambda sec: sec.get("order", 0))
    newest = ordered[-1]
    newest_order = newest.get("order", 0)

    return {
        "section_count": len(allSections),
        "last_section_id": newest.get("id", ""),
        "last_order": newest_order,
        "next_order": newest_order + 1,
        "last_content_type": newest.get("content_type", ""),
        "last_content_sample": extractContentSample(newest),
    }