"""Helpers for extracting, parsing, merging and repairing JSON text.

The input is typically free-form text that *contains* JSON (for example
wrapped in Markdown code fences, or truncated mid-document), so every helper
here is deliberately tolerant: it cleans up what it can and reports failure
through return values rather than raising, except for ``parseJsonOrRaise``.
"""

import json
import logging
import re
from typing import Any, Dict, List, Optional, Tuple, Union

logger = logging.getLogger(__name__)

# Opening bracket -> the closer that balances it.
_OPEN_TO_CLOSE = {'{': '}', '[': ']'}

# Typographic quotes mapped to their ASCII equivalents.
_SMART_QUOTES = str.maketrans({
    '\u201c': '"',
    '\u201d': '"',
    '\u2019': "'",
    '\u2018': "'",
})

# Header fields of a section object: id, content_type, order (in that order).
_SECTION_PATTERN = re.compile(
    r'"id"\s*:\s*"(section_\d+)"\s*,?\s*"content_type"\s*:\s*"(\w+)"\s*,?\s*"order"\s*:\s*(\d+)',
    re.IGNORECASE,
)

# An "elements" array up to its first ']'. DOTALL lets the array span lines.
_ELEMENTS_PATTERN = re.compile(r'"elements"\s*:\s*\[(.*?)\]', re.DOTALL)

# How far past a section header to look for its "elements" array.
_ELEMENTS_LOOKAHEAD = 500


def stripCodeFences(text: str) -> str:
    """Remove a surrounding ``` fence (with optional language tag) if present.

    The text is only unwrapped when, after trimming, it both starts and ends
    with a triple backtick. Everything up to and including the first newline
    is treated as the opening fence line. When there is no newline the opening
    fence cannot be told apart from content, so only the trailing fence is
    removed.

    Returns the trimmed text, or the input unchanged when it is empty.
    """
    if not text:
        return text

    s = text.strip()
    if not (s.startswith("```") and s.endswith("```")):
        return s

    # Drop the opening fence line, e.g. "```json\n".
    newline = s.find('\n', 3)
    if newline != -1:
        s = s[newline + 1:]

    # Drop the closing fence.
    if s.endswith("```"):
        s = s[:-3]
    return s.strip()


def extractFirstBalancedJson(text: str) -> str:
    """Return the first balanced JSON object/array substring of ``text``.

    Scanning starts at the first '{' or '[' and tracks nesting with a stack.
    Brackets that appear inside double-quoted strings (including after
    backslash escapes) are ignored, so ``{"a": "}"}`` is returned whole.
    A closer that does not match the innermost opener is skipped.

    Returns the trimmed input unchanged when no opener exists or the
    structure never balances.
    """
    if not text:
        return text

    s = text.strip()

    # Earliest opener of either kind.
    positions = [p for p in (s.find('{'), s.find('[')) if p != -1]
    if not positions:
        return s
    start = min(positions)

    stack: List[str] = []
    in_string = False
    escaped = False
    for i in range(start, len(s)):
        ch = s[i]
        if in_string:
            if escaped:
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == '"':
                in_string = False
            continue

        if ch == '"':
            in_string = True
        elif ch in _OPEN_TO_CLOSE:
            stack.append(ch)
        elif ch in '}]':
            # Ignore stray or mismatched closers instead of losing the opener.
            if not stack or _OPEN_TO_CLOSE[stack[-1]] != ch:
                continue
            stack.pop()
            if not stack:
                return s[start:i + 1].strip()
    return s


def normalizeJsonText(text: str) -> str:
    """Light normalization: drop leading BOM(s) and straighten smart quotes."""
    if not text:
        return text
    return text.lstrip('\ufeff').translate(_SMART_QUOTES)


def extractJsonString(text: str) -> str:
    """Normalize, strip code fences, then extract the first balanced JSON."""
    s = normalizeJsonText(text)
    s = stripCodeFences(s)
    s = extractFirstBalancedJson(s)
    return s.strip()


def tryParseJson(
    text: Union[str, bytes],
) -> Tuple[Optional[Union[Dict, List]], Optional[Exception], str]:
    """Extract and parse JSON without raising on malformed input.

    Args:
        text: Raw text or UTF-8 bytes that may contain JSON. Bytes are decoded
            with undecodable sequences replaced; falsy input is treated as "".

    Returns:
        ``(obj, error, cleaned)`` where ``cleaned`` is the string handed to
        ``json.loads``. Exactly one of ``obj`` / ``error`` is meaningful:
        ``error`` is ``None`` on success, otherwise ``obj`` is ``None``.
    """
    if isinstance(text, (bytes, bytearray)):
        # errors='replace' cannot raise, so no guard is needed here.
        text = bytes(text).decode('utf-8', errors='replace')

    cleaned = extractJsonString(text or "")
    try:
        return json.loads(cleaned), None, cleaned
    except Exception as e:  # boundary: the error is returned, never swallowed
        return None, e, cleaned


def parseJsonOrRaise(text: Union[str, bytes]) -> Union[Dict, List]:
    """Parse JSON from ``text`` or raise the underlying parse error.

    Logs the error together with the first 200 characters of the cleaned
    text before re-raising it.
    """
    obj, err, cleaned = tryParseJson(text)
    if err is not None:
        logger.error(
            "parse_json_or_raise failed: %s. Cleaned preview: %s...",
            err,
            cleaned[:200],
        )
        raise err
    return obj


def mergeRootLists(json_parts: List[Union[str, Dict, List]]) -> Dict[str, Any]:
    """Merge several JSON parts by concatenating their root-level lists.

    The first part that is (or parses to) a dict is the base. For every later
    dict part:
      - a list value whose key already holds a list in the base is appended;
      - a key missing from the base is added;
      - any other key keeps the value from the base.
    Parts that fail to parse, or whose root is not a dict, are ignored.
    If the base has a ``continuation`` key it is reset to ``None``.

    The inputs are never mutated: lists are copied before being extended.

    Returns:
        The merged dict, or ``{}`` when no part yields a dict.
    """
    parsed: List[Dict[str, Any]] = []
    for part in json_parts:
        if isinstance(part, (dict, list)):
            obj = part
        else:
            obj, err, _ = tryParseJson(part)
            if err is not None:
                continue
        if isinstance(obj, dict):
            parsed.append(obj)

    if not parsed:
        return {}

    def _own(value: Any) -> Any:
        # Copy lists so later extend() calls never touch a caller's object.
        return list(value) if isinstance(value, list) else value

    base: Dict[str, Any] = {k: _own(v) for k, v in parsed[0].items()}
    for obj in parsed[1:]:
        for k, v in obj.items():
            if isinstance(v, list) and isinstance(base.get(k), list):
                base[k].extend(v)
            elif k not in base:
                base[k] = _own(v)

    if 'continuation' in base:
        base['continuation'] = None
    return base


def repairBrokenJson(text: str) -> Optional[Dict[str, Any]]:
    """Attempt to repair broken (typically truncated) JSON.

    Strategies, in order:
      1. Structure closing / progressive parsing: take the longest prefix of
         ``text`` that parses to a dict once its open strings, arrays and
         objects are closed. The full text is the first candidate.
      2. Regex extraction of section headers, wrapped as
         ``{"documents": [{"sections": [...]}]}``.

    An empty dict recovered from a shortened prefix is not accepted as a
    repair; the regex fallback is tried instead.

    Returns:
        The repaired dict, or ``None`` when every strategy fails.
    """
    if not text:
        return None

    # NOTE: every candidate prefix is re-parsed, so this is quadratic when no
    # prefix is repairable. Truncated output normally succeeds within the
    # first few iterations.
    full_length = len(text)
    for end in range(full_length, 0, -1):
        obj, err, _ = tryParseJson(_closeJsonStructures(text[:end]))
        if err is not None or not isinstance(obj, dict):
            continue
        if obj:
            logger.debug(f"Progressive parsing success at length {end}")
            logger.info(f"Repaired JSON using progressive parsing (valid length: {end})")
            return obj
        if end == full_length:
            logger.info("Repaired JSON using structure closing")
            return obj
        # Only an empty object survived truncation; fall back to regex.
        break

    extracted = _extractSectionsRegex(text)
    if extracted:
        logger.info("Repaired JSON using regex extraction")
        return {"documents": [{"sections": extracted}]}

    logger.warning("All repair strategies failed")
    return None


def _closeJsonStructures(text: str) -> str:
    """Append whatever is needed to close the open JSON structures in ``text``.

    Tracks nesting with a stack, ignoring brackets inside double-quoted
    strings, and appends closers innermost-first so ``{"a": [1`` becomes
    ``{"a": [1]}``. An unterminated string is closed first; a dangling
    backslash at its end is dropped so it cannot escape the added quote.
    """
    if not text:
        return text

    pending: List[str] = []  # closers still owed, outermost first
    in_string = False
    escaped = False
    for ch in text:
        if in_string:
            if escaped:
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch in _OPEN_TO_CLOSE:
            pending.append(_OPEN_TO_CLOSE[ch])
        elif ch in '}]' and pending and pending[-1] == ch:
            pending.pop()

    result = text
    if in_string:
        if escaped:
            result = result[:-1]
        result += '"'
    return result + ''.join(reversed(pending))


def _extractSectionsRegex(text: str) -> List[Dict[str, Any]]:
    """Recover section objects from corrupted JSON using regex patterns.

    Finds each ``"id": "section_N", "content_type": ..., "order": N`` header
    and looks up to 500 characters ahead for an ``"elements"`` array. The
    array is taken up to its first ']'; if that slice is not valid JSON the
    section is still returned, with an empty ``elements`` list.
    """
    sections: List[Dict[str, Any]] = []
    for match in _SECTION_PATTERN.finditer(text):
        window = text[match.end():match.end() + _ELEMENTS_LOOKAHEAD]
        elements: Any = []
        elements_match = _ELEMENTS_PATTERN.search(window)
        if elements_match:
            try:
                elements = json.loads('[' + elements_match.group(1) + ']')
            except ValueError:
                # Best effort: keep the section header without its elements.
                elements = []

        sections.append({
            "id": match.group(1),
            "content_type": match.group(2),
            "elements": elements,
            "order": int(match.group(3)),
        })
    return sections


def extractSectionsFromDocument(documentData: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Collect all sections from a document structure.

    Supports two layouts:
      - nested: ``{"documents": [{"sections": [...]}, ...]}`` - the sections
        of every document are concatenated;
      - flat: ``{"sections": [...]}``.
    When a ``documents`` key is present the flat layout is not consulted.

    Returns ``[]`` for non-dict input or when no section list is found.
    """
    if not isinstance(documentData, dict):
        return []

    if "documents" in documentData:
        documents = documentData["documents"]
        if not isinstance(documents, (list, tuple)):
            # e.g. null in the source JSON; nothing to iterate.
            documents = []
        all_sections: List[Dict[str, Any]] = []
        for doc in documents:
            if isinstance(doc, dict):
                sections = doc.get("sections")
                if isinstance(sections, list):
                    all_sections.extend(sections)
        return all_sections

    sections = documentData.get("sections")
    if isinstance(sections, list):
        return sections
    return []


def _asText(value: Any) -> str:
    """Coerce an element field to text; ``None`` becomes the empty string."""
    if value is None:
        return ""
    return value if isinstance(value, str) else str(value)


def extractContentSample(section: Dict[str, Any]) -> str:
    """Describe the tail of a section's content for a continuation prompt.

    Only the last five elements are inspected, and within them the *latest*
    matching element wins:
      - ``list``: the text of up to the last three items;
      - ``paragraph``: the last 150 characters of the last text element;
      - ``code``: the last three lines when there are more than five lines,
        otherwise just the line count;
      - ``table``: the row count of the last table element.

    Returns ``""`` for non-dict input and ``"Content exists"`` when nothing
    more specific can be said.
    """
    if not isinstance(section, dict):
        return ""

    content_type = _asText(section.get("content_type")).lower()
    elements = section.get("elements", [])
    if not elements or not isinstance(elements, list):
        return "Content exists"

    recent = [elem for elem in elements[-5:] if isinstance(elem, dict)]

    if content_type == "list":
        items = [_asText(elem["text"]) for elem in recent if "text" in elem]
        if items:
            shown = items[-3:]
            return f"Last {len(shown)} items: {', '.join(shown)}"

    elif content_type == "paragraph":
        for elem in reversed(recent):
            if "text" in elem:
                text = _asText(elem["text"])
                if len(text) > 150:
                    text = "..." + text[-150:]
                return f"Last content: {text}"

    elif content_type == "code":
        for elem in reversed(recent):
            if "code" in elem:
                lines = _asText(elem["code"]).split('\n')
                if len(lines) > 5:
                    return f"Last lines ({len(lines)} total): {', '.join(lines[-3:])}"
                return f"Code ({len(lines)} lines)"

    elif content_type == "table":
        for elem in reversed(recent):
            if "rows" in elem:
                rows = elem["rows"]
                row_count = len(rows) if isinstance(rows, (list, tuple)) else 0
                return f"Table with {row_count} rows"

    return "Content exists"


def _sectionOrder(section: Dict[str, Any]) -> Union[int, float]:
    """Return a section's ``order`` as a number, or 0 when missing/invalid."""
    order = section.get("order", 0)
    if isinstance(order, (int, float)):
        return order
    try:
        return int(order)
    except (TypeError, ValueError):
        return 0


def buildContinuationContext(allSections: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Summarize already-generated sections for a continuation prompt.

    The "last" section is the one with the highest ``order`` (the later entry
    wins ties). Non-dict entries are ignored, and an ``order`` that is missing
    or not numeric counts as 0.

    Returns:
        With no usable sections: ``section_count`` (0), ``next_order`` (1) and
        ``last_content_sample``. Otherwise also ``last_section_id``,
        ``last_order``, ``last_content_type``, with ``next_order`` set to
        ``last_order + 1``.
    """
    sections = [s for s in (allSections or []) if isinstance(s, dict)]
    if not sections:
        return {
            "section_count": 0,
            "next_order": 1,
            "last_content_sample": "No content yet",
        }

    # sorted() is stable, so among equal orders the later section is last.
    last_section = sorted(sections, key=_sectionOrder)[-1]
    last_order = _sectionOrder(last_section)

    return {
        "section_count": len(sections),
        "last_section_id": last_section.get("id", ""),
        "last_order": last_order,
        "next_order": last_order + 1,
        "last_content_type": last_section.get("content_type", ""),
        "last_content_sample": extractContentSample(last_section),
    }