full dynamic document extraction - processing - generation
This commit is contained in:
parent
be5f6773b6
commit
1aecec9d61
5 changed files with 295 additions and 70 deletions
|
|
@ -1515,68 +1515,70 @@ class AiService:
|
||||||
title = "AI Generated Document"
|
title = "AI Generated Document"
|
||||||
|
|
||||||
# Get format-specific extraction prompt
|
# Get format-specific extraction prompt
|
||||||
extraction_prompt = generation_service.getExtractionPrompt(
|
extractionPrompt = await generation_service.getExtractionPrompt(
|
||||||
output_format=outputFormat,
|
outputFormat=outputFormat,
|
||||||
user_prompt=prompt,
|
userPrompt=prompt,
|
||||||
title=title
|
title=title,
|
||||||
|
aiService=self
|
||||||
)
|
)
|
||||||
|
|
||||||
# Process documents with format-specific prompt using CLEAN mode
|
# Process documents with format-specific prompt using CLEAN mode
|
||||||
# This ensures no debug metadata is included in the final output
|
# This ensures no debug metadata is included in the final output
|
||||||
ai_response = await self._callAiTextClean(extraction_prompt, documents, options)
|
aiResponse = await self._callAiTextClean(extractionPrompt, documents, options)
|
||||||
|
|
||||||
# Parse filename header from AI response if present
|
# Parse filename header from AI response if present
|
||||||
parsed_filename = None
|
parsedFilename = None
|
||||||
try:
|
try:
|
||||||
if ai_response:
|
if aiResponse:
|
||||||
first_newline = ai_response.find('\n')
|
firstNewline = aiResponse.find('\n')
|
||||||
header_line = ai_response if first_newline == -1 else ai_response[:first_newline]
|
headerLine = aiResponse if firstNewline == -1 else aiResponse[:firstNewline]
|
||||||
if header_line.strip().lower().startswith('filename:'):
|
if headerLine.strip().lower().startswith('filename:'):
|
||||||
parsed = header_line.split(':', 1)[1].strip()
|
parsed = headerLine.split(':', 1)[1].strip()
|
||||||
# basic sanitization
|
# basic sanitization
|
||||||
import re
|
import re
|
||||||
parsed = re.sub(r"[^a-zA-Z0-9._-]", "-", parsed)
|
parsed = re.sub(r"[^a-zA-Z0-9._-]", "-", parsed)
|
||||||
parsed = re.sub(r"-+", "-", parsed).strip('-')
|
parsed = re.sub(r"-+", "-", parsed).strip('-')
|
||||||
if parsed:
|
if parsed:
|
||||||
parsed_filename = parsed
|
parsedFilename = parsed
|
||||||
# remove header line from content for rendering
|
# remove header line from content for rendering
|
||||||
ai_response = ai_response[first_newline+1:].lstrip('\n') if first_newline != -1 else ''
|
aiResponse = aiResponse[firstNewline+1:].lstrip('\n') if firstNewline != -1 else ''
|
||||||
except Exception:
|
except Exception:
|
||||||
parsed_filename = None
|
parsedFilename = None
|
||||||
|
|
||||||
if not ai_response or ai_response.strip() == "":
|
if not aiResponse or aiResponse.strip() == "":
|
||||||
raise Exception("AI content generation failed")
|
raise Exception("AI content generation failed")
|
||||||
|
|
||||||
# Render the content to the specified format
|
# Render the content to the specified format
|
||||||
rendered_content, mime_type = await generation_service.renderReport(
|
renderedContent, mimeType = await generation_service.renderReport(
|
||||||
extracted_content=ai_response,
|
extractedContent=aiResponse,
|
||||||
output_format=outputFormat,
|
outputFormat=outputFormat,
|
||||||
title=title,
|
title=title,
|
||||||
user_prompt=prompt
|
userPrompt=prompt,
|
||||||
|
aiService=self
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate meaningful filename (use AI-provided if valid, else fallback)
|
# Generate meaningful filename (use AI-provided if valid, else fallback)
|
||||||
from datetime import datetime, UTC
|
from datetime import datetime, UTC
|
||||||
timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
|
timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
|
||||||
if parsed_filename and parsed_filename.lower().endswith(f".{outputFormat.lower()}"):
|
if parsedFilename and parsedFilename.lower().endswith(f".{outputFormat.lower()}"):
|
||||||
filename = parsed_filename
|
filename = parsedFilename
|
||||||
else:
|
else:
|
||||||
safe_title = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-')
|
safeTitle = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-')
|
||||||
filename = f"{safe_title or 'document'}-{timestamp}.{outputFormat}"
|
filename = f"{safeTitle or 'document'}-{timestamp}.{outputFormat}"
|
||||||
|
|
||||||
# Return structured result with document information
|
# Return structured result with document information
|
||||||
return {
|
return {
|
||||||
"success": True,
|
"success": True,
|
||||||
"content": ai_response, # Raw AI response
|
"content": aiResponse, # Raw AI response
|
||||||
"rendered_content": rendered_content, # Formatted content
|
"rendered_content": renderedContent, # Formatted content
|
||||||
"mime_type": mime_type,
|
"mime_type": mimeType,
|
||||||
"filename": filename,
|
"filename": filename,
|
||||||
"format": outputFormat,
|
"format": outputFormat,
|
||||||
"title": title,
|
"title": title,
|
||||||
"documents": [{
|
"documents": [{
|
||||||
"documentName": filename,
|
"documentName": filename,
|
||||||
"documentData": rendered_content,
|
"documentData": renderedContent,
|
||||||
"mimeType": mime_type
|
"mimeType": mimeType
|
||||||
}]
|
}]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -296,14 +296,16 @@ class GenerationService:
|
||||||
'workflowId': 'unknown'
|
'workflowId': 'unknown'
|
||||||
}
|
}
|
||||||
|
|
||||||
async def renderReport(self, extracted_content: str, output_format: str, title: str, user_prompt: str = None) -> tuple[str, str]:
|
async def renderReport(self, extractedContent: str, outputFormat: str, title: str, userPrompt: str = None, aiService=None) -> tuple[str, str]:
|
||||||
"""
|
"""
|
||||||
Render extracted content to the specified output format.
|
Render extracted content to the specified output format.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
extracted_content: Content extracted by AI using format-specific prompt
|
extractedContent: Content extracted by AI using format-specific prompt
|
||||||
output_format: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
|
outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
|
||||||
title: Report title
|
title: Report title
|
||||||
|
userPrompt: User's original prompt for report generation
|
||||||
|
aiService: AI service instance for generation prompt creation
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
tuple: (rendered_content, mime_type)
|
tuple: (rendered_content, mime_type)
|
||||||
|
|
@ -317,66 +319,83 @@ class GenerationService:
|
||||||
debug_dir = os.path.join(debug_root, f"render_input_{ts}")
|
debug_dir = os.path.join(debug_root, f"render_input_{ts}")
|
||||||
os.makedirs(debug_dir, exist_ok=True)
|
os.makedirs(debug_dir, exist_ok=True)
|
||||||
with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f:
|
with open(os.path.join(debug_dir, "meta.txt"), "w", encoding="utf-8") as f:
|
||||||
f.write(f"title: {title}\nformat: {output_format}\nlength: {len(extracted_content or '')}\nstarts_with_brace: {str(extracted_content.strip().startswith('{') if extracted_content else False)}\n")
|
f.write(f"title: {title}\nformat: {outputFormat}\nlength: {len(extractedContent or '')}\nstarts_with_brace: {str(extractedContent.strip().startswith('{') if extractedContent else False)}\n")
|
||||||
with open(os.path.join(debug_dir, "extracted_content.txt"), "w", encoding="utf-8") as f:
|
with open(os.path.join(debug_dir, "extracted_content.txt"), "w", encoding="utf-8") as f:
|
||||||
f.write(extracted_content or "")
|
f.write(extractedContent or "")
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Get the appropriate renderer for the format
|
# Get the appropriate renderer for the format
|
||||||
renderer = self._getFormatRenderer(output_format)
|
renderer = self._getFormatRenderer(outputFormat)
|
||||||
if not renderer:
|
if not renderer:
|
||||||
raise ValueError(f"Unsupported output format: {output_format}")
|
raise ValueError(f"Unsupported output format: {outputFormat}")
|
||||||
|
|
||||||
# Render the content with user prompt for structure
|
# Generate AI-based generation prompt if AI service is available
|
||||||
rendered_content, mime_type = await renderer.render(extracted_content, title, user_prompt)
|
generationPrompt = userPrompt # Default to user prompt
|
||||||
|
if aiService and userPrompt:
|
||||||
|
try:
|
||||||
|
from .prompt_builder import buildGenerationPrompt
|
||||||
|
generationPrompt = await buildGenerationPrompt(
|
||||||
|
outputFormat=outputFormat,
|
||||||
|
userPrompt=userPrompt,
|
||||||
|
title=title,
|
||||||
|
aiService=aiService
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to generate AI-based generation prompt: {str(e)}, using user prompt")
|
||||||
|
generationPrompt = userPrompt
|
||||||
|
|
||||||
|
# Render the content with AI-generated prompt
|
||||||
|
renderedContent, mimeType = await renderer.render(extractedContent, title, generationPrompt)
|
||||||
# DEBUG: dump rendered output
|
# DEBUG: dump rendered output
|
||||||
try:
|
try:
|
||||||
import os
|
import os
|
||||||
with open(os.path.join(debug_dir, "rendered_output.txt"), "w", encoding="utf-8") as f:
|
with open(os.path.join(debug_dir, "rendered_output.txt"), "w", encoding="utf-8") as f:
|
||||||
f.write(rendered_content or "")
|
f.write(renderedContent or "")
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
logger.info(f"Successfully rendered report to {output_format} format: {len(rendered_content)} characters")
|
logger.info(f"Successfully rendered report to {outputFormat} format: {len(renderedContent)} characters")
|
||||||
return rendered_content, mime_type
|
return renderedContent, mimeType
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error rendering report to {output_format}: {str(e)}")
|
logger.error(f"Error rendering report to {outputFormat}: {str(e)}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
async def getExtractionPrompt(self, outputFormat: str, userPrompt: str, title: str, aiService=None) -> str:
    """
    Build the extraction prompt used to pull content for a given output format.

    Args:
        outputFormat: Target format (html, pdf, docx, txt, md, json, csv, xlsx)
        userPrompt: User's original prompt for report generation
        title: Report title
        aiService: AI service instance, forwarded unchanged to the prompt builder

    Returns:
        str: The prompt produced by ``buildExtractionPrompt``

    Raises:
        ValueError: If no renderer is registered for ``outputFormat``.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        # Resolve the renderer first; it is handed to the prompt builder below
        formatRenderer = self._getFormatRenderer(outputFormat)
        if not formatRenderer:
            raise ValueError(f"Unsupported output format: {outputFormat}")

        # Imported lazily, only once a renderer has been found
        from .prompt_builder import buildExtractionPrompt

        builderArgs = {
            "outputFormat": outputFormat,
            "renderer": formatRenderer,
            "userPrompt": userPrompt,
            "title": title,
            "aiService": aiService,
        }
        builtPrompt = await buildExtractionPrompt(**builderArgs)

        logger.info(f"Generated {outputFormat}-specific extraction prompt: {len(builtPrompt)} characters")
        return builtPrompt

    except Exception as err:
        logger.error(f"Error getting extraction prompt for {outputFormat}: {str(err)}")
        raise
|
||||||
|
|
||||||
def _getFormatRenderer(self, output_format: str):
|
def _getFormatRenderer(self, output_format: str):
|
||||||
|
|
|
||||||
|
|
@ -16,15 +16,16 @@ class _RendererLike(Protocol):
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
def buildExtractionPrompt(
|
async def buildExtractionPrompt(
|
||||||
output_format: str,
|
outputFormat: str,
|
||||||
renderer: _RendererLike,
|
renderer: _RendererLike,
|
||||||
user_prompt: str,
|
userPrompt: str,
|
||||||
title: str
|
title: str,
|
||||||
|
aiService=None
|
||||||
) -> str:
|
) -> str:
|
||||||
"""
|
"""
|
||||||
Build the final extraction prompt by combining:
|
Build the final extraction prompt by combining:
|
||||||
- The raw user prompt (verbatim)
|
- Parsed extraction intent from user prompt (using AI)
|
||||||
- Generic cross-format instructions (filename header + real-data policy)
|
- Generic cross-format instructions (filename header + real-data policy)
|
||||||
- Format-specific guidelines snippet provided by the renderer
|
- Format-specific guidelines snippet provided by the renderer
|
||||||
|
|
||||||
|
|
@ -33,13 +34,16 @@ def buildExtractionPrompt(
|
||||||
followed by a blank line and then ONLY the document content according to the target format.
|
followed by a blank line and then ONLY the document content according to the target format.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
format_guidelines = renderer.getExtractionPrompt(user_prompt, title)
|
# Parse user prompt to separate extraction intent from generation format using AI
|
||||||
|
extractionIntent = await _parseExtractionIntent(userPrompt, outputFormat, aiService)
|
||||||
|
|
||||||
|
formatGuidelines = renderer.getExtractionPrompt(userPrompt, title)
|
||||||
|
|
||||||
# Generic block appears once for every format
|
# Generic block appears once for every format
|
||||||
generic_intro = f"""
|
genericIntro = f"""
|
||||||
{user_prompt}
|
{extractionIntent}
|
||||||
|
|
||||||
You are generating a document in {output_format.upper()} format for the title: "{title}".
|
You are generating a document in {outputFormat.upper()} format for the title: "{title}".
|
||||||
|
|
||||||
Rules:
|
Rules:
|
||||||
- The user's intent fully defines the structure. Do not assume a fixed template or headings.
|
- The user's intent fully defines the structure. Do not assume a fixed template or headings.
|
||||||
|
|
@ -62,13 +66,99 @@ Common policy:
|
||||||
""".strip()
|
""".strip()
|
||||||
|
|
||||||
# Final assembly
|
# Final assembly
|
||||||
final_prompt = (
|
finalPrompt = (
|
||||||
generic_intro
|
genericIntro
|
||||||
+ "\n\nFORMAT-SPECIFIC GUIDELINES:\n"
|
+ "\n\nFORMAT-SPECIFIC GUIDELINES:\n"
|
||||||
+ format_guidelines.strip()
|
+ formatGuidelines.strip()
|
||||||
+ "\n\nGenerate the complete document content now based on the source documents below:"
|
+ "\n\nGenerate the complete document content now based on the source documents below:"
|
||||||
)
|
)
|
||||||
|
|
||||||
return final_prompt
|
return finalPrompt
|
||||||
|
|
||||||
|
|
||||||
|
async def buildGenerationPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None
) -> str:
    """
    Ask the AI service to turn the user's request into a generation prompt.

    Args:
        outputFormat: Target format name, interpolated into the prompt text.
        userPrompt: The user's original request.
        title: Document title, interpolated into the prompt text.
        aiService: Object exposing an awaitable
            ``callAi(prompt=..., documents=..., options=...)``. When omitted,
            no AI call is made.

    Returns:
        The text returned by ``aiService.callAi`` when it is a non-blank
        string; otherwise a generic fallback prompt built from
        ``outputFormat`` and ``title``. AI failures are logged and answered
        with the fallback instead of being raised.
    """
    import logging

    fallbackPrompt = (
        f"Generate a comprehensive {outputFormat} document titled '{title}' "
        "based on the extracted content."
    )

    # Nothing to ask the model about: skip the call entirely
    if not aiService or not isinstance(userPrompt, str) or not userPrompt.strip():
        return fallbackPrompt

    # Flatten the request onto one line and escape quotes so it stays inside
    # the quoted "User request" field. This is formatting hygiene only, not a
    # real defence against prompt injection.
    safeUserPrompt = (
        userPrompt.replace('"', '\\"')
        .replace("'", "\\'")
        .replace('\n', ' ')
        .replace('\r', ' ')
    )

    generationPromptRequest = "\n".join([
        "",
        f"Based on this user request, create a detailed generation prompt for creating a {outputFormat} document.",
        "",
        f'User request: "{safeUserPrompt}"',
        f'Document title: "{title}"',
        f"Output format: {outputFormat}",
        "",
        "Create a generation prompt that:",
        "1. Identifies what content is most important for the user",
        "2. Specifies how to structure and organize the content. Support with your inputs for structure to match best the user's intention.",
        "3. Includes any specific formatting or presentation requirements",
        "4. Ensures the document meets the user's needs",
        "",
        f'Return only the generation prompt, starting with "Generate a {outputFormat} document that..."',
        "",
    ])

    try:
        result = await aiService.callAi(
            prompt=generationPromptRequest,
            documents=None,
            options=None
        )
    except Exception as exc:
        # Best effort: rendering can proceed with the generic prompt, but the
        # failure should be visible in the logs.
        logging.getLogger(__name__).warning(
            "Generation prompt request failed, using fallback prompt: %s", exc
        )
        return fallbackPrompt

    # Only a non-blank string is usable as a prompt
    if isinstance(result, str) and result.strip():
        return result
    return fallbackPrompt
|
||||||
|
|
||||||
|
|
||||||
|
async def _parseExtractionIntent(userPrompt: str, outputFormat: str, aiService=None) -> str:
    """
    Reduce the user's request to WHAT content should be extracted.

    Args:
        userPrompt: The user's original request.
        outputFormat: Target format name. Accepted for signature parity with
            the other prompt builders; not used in the request text.
        aiService: Object exposing an awaitable
            ``callAi(prompt=..., documents=..., options=...)``. When omitted,
            no AI call is made.

    Returns:
        The AI reply when it is a non-blank string. Otherwise the user's own
        request (stripped), so the requirements are never dropped from the
        extraction prompt. Only when the request itself is blank is a generic
        "extract everything" sentence returned. AI failures are logged, not
        raised.
    """
    import logging

    genericIntent = "Extract all relevant content from the document according to the user's requirements"

    requestText = userPrompt.strip() if isinstance(userPrompt, str) else ""
    # Falling back to the raw request keeps the user's requirements in the
    # prompt; the generic sentence alone refers to requirements it omits.
    fallbackIntent = requestText or genericIntent

    if not aiService or not requestText:
        return fallbackIntent

    # Flatten the request onto one line and escape quotes so it stays inside
    # the quoted "User request" field. Formatting hygiene only, not a real
    # defence against prompt injection.
    safeUserPrompt = (
        userPrompt.replace('"', '\\"')
        .replace("'", "\\'")
        .replace('\n', ' ')
        .replace('\r', ' ')
    )

    extractionPrompt = "\n".join([
        "",
        "Extract the core content intention from this user request. Focus on WHAT content they want.",
        "",
        f'User request: "{safeUserPrompt}"',
        "",
        'Return only the content intention in a simple format like "Extract: [content description]"',
        "Do not include formatting instructions, file types, or output methods.",
        "",
    ])

    try:
        result = await aiService.callAi(
            prompt=extractionPrompt,
            documents=None,
            options=None
        )
    except Exception as exc:
        # Best effort: extraction can proceed with the raw request, but the
        # failure should be visible in the logs.
        logging.getLogger(__name__).warning(
            "Extraction intent request failed, using the user's prompt: %s", exc
        )
        return fallbackIntent

    # Only a non-blank string is usable as an intent
    if isinstance(result, str) and result.strip():
        return result
    return fallbackIntent
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -46,6 +46,7 @@ class DocxRenderer(BaseRenderer):
|
||||||
"- Structure your response with clear headings using numbered format: 1) Heading, 2) Heading, etc.\n"
|
"- Structure your response with clear headings using numbered format: 1) Heading, 2) Heading, etc.\n"
|
||||||
"- Use bullet points (-) for lists and sub-items\n"
|
"- Use bullet points (-) for lists and sub-items\n"
|
||||||
"- Use **bold** for emphasis on key terms\n"
|
"- Use **bold** for emphasis on key terms\n"
|
||||||
|
"- Use pipe-separated format (Item | Status) for tables when appropriate\n"
|
||||||
"- Provide clean, structured content that can be directly converted to Word formatting\n"
|
"- Provide clean, structured content that can be directly converted to Word formatting\n"
|
||||||
"- Do NOT include debug information, separators (---), metadata, or FILENAME headers\n"
|
"- Do NOT include debug information, separators (---), metadata, or FILENAME headers\n"
|
||||||
"- Start directly with your content - no introductory text or separators\n"
|
"- Start directly with your content - no introductory text or separators\n"
|
||||||
|
|
@ -348,6 +349,40 @@ class DocxRenderer(BaseRenderer):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.warning(f"Could not style table: {str(e)}")
|
self.logger.warning(f"Could not style table: {str(e)}")
|
||||||
|
|
||||||
|
def _process_table_row(self, doc, line: str):
    """
    Add one pipe-separated line to the document as a table row.

    The first row seen for a document creates a 'Table Grid' table and is
    written as a bold header; later rows are appended to that table.

    Args:
        doc: Document object providing ``add_table(rows=, cols=)`` and
            ``add_paragraph(text)``.
        line: Raw text line, e.g. ``Item | Status`` or ``| Item | Status |``.

    Behaviour:
        - Blank lines are ignored.
        - A line without ``|`` is added as a plain paragraph and ends the
          table in progress.
        - Markdown separator rows such as ``|---|---|`` are skipped.
        - Cells beyond the table width are folded into the last cell rather
          than dropped.
        - A table is only reused for the document it was created in.

    NOTE(review): a table is only closed when this method receives a line
    without ``|``; callers that route other lines elsewhere should reset
    ``self._current_table`` to None to separate consecutive tables.
    """
    import re

    stripped = line.strip()
    if not stripped:
        return

    if '|' not in stripped:
        # Not a table row: emit as text and close any table in progress
        self._current_table = None
        doc.add_paragraph(line)
        return

    # Markdown style "| a | b |": the outer pipes are delimiters, not cells
    if len(stripped) >= 2 and stripped.startswith('|') and stripped.endswith('|'):
        stripped = stripped[1:-1]
    cells = [cell.strip() for cell in stripped.split('|')]
    if not cells:
        return

    # Header/body separator rows carry no data
    if all(re.fullmatch(r":?-{2,}:?", cell) for cell in cells):
        return

    table = getattr(self, '_current_table', None)
    if table is not None and getattr(self, '_current_table_doc', None) is not doc:
        # Left over from a previously rendered document: never append to it
        table = None

    if table is None:
        table = doc.add_table(rows=1, cols=len(cells))
        table.style = 'Table Grid'
        self._current_table = table
        self._current_table_doc = doc

        # The first row of a table is its header, written in bold
        for cell, text in zip(table.rows[0].cells, cells):
            cell.text = text
            for paragraph in cell.paragraphs:
                for run in paragraph.runs:
                    run.bold = True
        return

    rowCells = table.add_row().cells
    width = len(rowCells)
    if width and len(cells) > width:
        # Keep overflow text (e.g. a literal pipe inside a value) in the last cell
        cells = cells[:width - 1] + [' | '.join(cells[width - 1:])]
    for cell, text in zip(rowCells, cells):
        cell.text = text
|
||||||
|
|
||||||
def _clean_ai_content(self, content: str) -> str:
|
def _clean_ai_content(self, content: str) -> str:
|
||||||
"""Clean AI-generated content by removing debug information and duplicates."""
|
"""Clean AI-generated content by removing debug information and duplicates."""
|
||||||
if not content:
|
if not content:
|
||||||
|
|
@ -435,6 +470,16 @@ class DocxRenderer(BaseRenderer):
|
||||||
bullet_text = line[2:] # Remove "- "
|
bullet_text = line[2:] # Remove "- "
|
||||||
self._add_bullet_point(doc, bullet_text)
|
self._add_bullet_point(doc, bullet_text)
|
||||||
|
|
||||||
|
# Check if this is a table row (contains pipe separator)
|
||||||
|
elif '|' in line:
|
||||||
|
# Flush current paragraph
|
||||||
|
if current_paragraph:
|
||||||
|
self._add_paragraph_to_doc(doc, '\n'.join(current_paragraph))
|
||||||
|
current_paragraph = []
|
||||||
|
|
||||||
|
# This is a table row - collect table data
|
||||||
|
self._process_table_row(doc, line)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# Regular text - add to current paragraph
|
# Regular text - add to current paragraph
|
||||||
current_paragraph.append(line)
|
current_paragraph.append(line)
|
||||||
|
|
@ -462,9 +507,40 @@ class DocxRenderer(BaseRenderer):
|
||||||
if part:
|
if part:
|
||||||
run = para.add_run(part)
|
run = para.add_run(part)
|
||||||
run.bold = True
|
run.bold = True
|
||||||
|
|
||||||
|
def _process_table_row(self, doc, line: str):
    """
    Add one pipe-separated line to the document as a table row.

    The first row seen for a document creates a 'Table Grid' table and is
    written as a bold header; later rows are appended to that table.

    Args:
        doc: Document object providing ``add_table(rows=, cols=)`` and
            ``add_paragraph(text)``.
        line: Raw text line, e.g. ``Item | Status`` or ``| Item | Status |``.

    Behaviour:
        - Blank lines are ignored.
        - A line without ``|`` is added as a plain paragraph and ends the
          table in progress.
        - Markdown separator rows such as ``|---|---|`` are skipped.
        - Cells beyond the table width are folded into the last cell rather
          than dropped.
        - A table is only reused for the document it was created in.

    NOTE(review): a table is only closed when this method receives a line
    without ``|``; callers that route other lines elsewhere should reset
    ``self._current_table`` to None to separate consecutive tables.
    """
    import re

    stripped = line.strip()
    if not stripped:
        return

    if '|' not in stripped:
        # Not a table row: emit as text and close any table in progress
        self._current_table = None
        doc.add_paragraph(line)
        return

    # Markdown style "| a | b |": the outer pipes are delimiters, not cells
    if len(stripped) >= 2 and stripped.startswith('|') and stripped.endswith('|'):
        stripped = stripped[1:-1]
    cells = [cell.strip() for cell in stripped.split('|')]
    if not cells:
        return

    # Header/body separator rows carry no data
    if all(re.fullmatch(r":?-{2,}:?", cell) for cell in cells):
        return

    table = getattr(self, '_current_table', None)
    if table is not None and getattr(self, '_current_table_doc', None) is not doc:
        # Left over from a previously rendered document: never append to it
        table = None

    if table is None:
        table = doc.add_table(rows=1, cols=len(cells))
        table.style = 'Table Grid'
        self._current_table = table
        self._current_table_doc = doc

        # The first row of a table is its header, written in bold
        for cell, text in zip(table.rows[0].cells, cells):
            cell.text = text
            for paragraph in cell.paragraphs:
                for run in paragraph.runs:
                    run.bold = True
        return

    rowCells = table.add_row().cells
    width = len(rowCells)
    if width and len(cells) > width:
        # Keep overflow text (e.g. a literal pipe inside a value) in the last cell
        cells = cells[:width - 1] + [' | '.join(cells[width - 1:])]
    for cell, text in zip(rowCells, cells):
        cell.text = text
|
||||||
|
|
||||||
def _add_bullet_point(self, doc, text: str):
|
def _add_bullet_point(self, doc, text: str):
|
||||||
"""Add a bullet point to the document."""
|
"""Add a bullet point to the document."""
|
||||||
|
|
@ -489,3 +565,37 @@ class DocxRenderer(BaseRenderer):
|
||||||
if part:
|
if part:
|
||||||
run = para.add_run(part)
|
run = para.add_run(part)
|
||||||
run.bold = True
|
run.bold = True
|
||||||
|
|
||||||
|
def _process_table_row(self, doc, line: str):
    """
    Add one pipe-separated line to the document as a table row.

    The first row seen for a document creates a 'Table Grid' table and is
    written as a bold header; later rows are appended to that table.

    Args:
        doc: Document object providing ``add_table(rows=, cols=)`` and
            ``add_paragraph(text)``.
        line: Raw text line, e.g. ``Item | Status`` or ``| Item | Status |``.

    Behaviour:
        - Blank lines are ignored.
        - A line without ``|`` is added as a plain paragraph and ends the
          table in progress.
        - Markdown separator rows such as ``|---|---|`` are skipped.
        - Cells beyond the table width are folded into the last cell rather
          than dropped.
        - A table is only reused for the document it was created in.

    NOTE(review): a table is only closed when this method receives a line
    without ``|``; callers that route other lines elsewhere should reset
    ``self._current_table`` to None to separate consecutive tables.
    """
    import re

    stripped = line.strip()
    if not stripped:
        return

    if '|' not in stripped:
        # Not a table row: emit as text and close any table in progress
        self._current_table = None
        doc.add_paragraph(line)
        return

    # Markdown style "| a | b |": the outer pipes are delimiters, not cells
    if len(stripped) >= 2 and stripped.startswith('|') and stripped.endswith('|'):
        stripped = stripped[1:-1]
    cells = [cell.strip() for cell in stripped.split('|')]
    if not cells:
        return

    # Header/body separator rows carry no data
    if all(re.fullmatch(r":?-{2,}:?", cell) for cell in cells):
        return

    table = getattr(self, '_current_table', None)
    if table is not None and getattr(self, '_current_table_doc', None) is not doc:
        # Left over from a previously rendered document: never append to it
        table = None

    if table is None:
        table = doc.add_table(rows=1, cols=len(cells))
        table.style = 'Table Grid'
        self._current_table = table
        self._current_table_doc = doc

        # The first row of a table is its header, written in bold
        for cell, text in zip(table.rows[0].cells, cells):
            cell.text = text
            for paragraph in cell.paragraphs:
                for run in paragraph.runs:
                    run.bold = True
        return

    rowCells = table.add_row().cells
    width = len(rowCells)
    if width and len(cells) > width:
        # Keep overflow text (e.g. a literal pipe inside a value) in the last cell
        cells = cells[:width - 1] + [' | '.join(cells[width - 1:])]
    for cell, text in zip(rowCells, cells):
        cell.text = text
|
||||||
|
|
@ -170,10 +170,14 @@ async def process_documents_and_generate_summary():
|
||||||
# Run a single end-to-end test to avoid the loop issue
|
# Run a single end-to-end test to avoid the loop issue
|
||||||
logger.info("🧪 Running single end-to-end test...")
|
logger.info("🧪 Running single end-to-end test...")
|
||||||
|
|
||||||
|
# userPrompt = "Analyze these documents and create a comprehensive DOCX summary document including: 1) Document types and purposes, 2) Key information and main points, 3) Important details and numbers, 4) Notable sections, 5) Overall assessment and recommendations."
|
||||||
|
|
||||||
|
userPrompt = "Create a docx file containing a summary and the COMPLETE list from the pdf file, having one additional column with a 'x' marker for all items, which are yellow highlighted."
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Single AI call with DOCX generation
|
# Single AI call with DOCX generation
|
||||||
ai_response = await ai_service.callAi(
|
ai_response = await ai_service.callAi(
|
||||||
prompt="Analyze these documents and create a comprehensive DOCX summary document including: 1) Document types and purposes, 2) Key information and main points, 3) Important details and numbers, 4) Notable sections, 5) Overall assessment and recommendations.",
|
prompt=userPrompt,
|
||||||
documents=documents,
|
documents=documents,
|
||||||
options=ai_options,
|
options=ai_options,
|
||||||
outputFormat="docx",
|
outputFormat="docx",
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue