122 lines
3.9 KiB
Python
122 lines
3.9 KiB
Python
from anthropic import Anthropic
|
|
import base64
|
|
import magic
|
|
import os
|
|
from typing import Dict, Any, Union, List
|
|
|
|
def create_message_with_document(file_path: str, prompt_text: str = "Bitte analysiere dieses Dokument:") -> Dict[str, Any]:
    """
    Build a user message for the Anthropic API that embeds a file.

    The file is read in binary mode, Base64-encoded and attached as a
    content block placed after the prompt text.

    Args:
        file_path: Path to the file to attach.
        prompt_text: Text sent together with the file.

    Returns:
        A dict with "role" set to "user" and a two-element "content" list:
        the text block followed by the Base64-encoded file block.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Read the raw bytes once; they feed both the encoder and the type sniffer.
    with open(file_path, "rb") as file:
        file_content = file.read()

    base64_file = base64.b64encode(file_content).decode('utf-8')

    # Detect the MIME type from the file content using python-magic.
    mime_type = magic.from_buffer(file_content, mime=True)

    # Fall back to the file extension when magic only reports generic binary.
    if mime_type == "application/octet-stream":
        extension = os.path.splitext(file_path)[1].lower()[1:]
        mime_type = get_mime_type_from_extension(extension)

    # Only the block type is needed; the second tuple element is not used here.
    content_type, _ = determine_content_structure(mime_type)

    # NOTE(review): the detected media type is passed through unchecked --
    # confirm the API accepts every type that can reach this point.
    return {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": prompt_text,
            },
            {
                "type": content_type,
                "source": {
                    "type": "base64",
                    "media_type": mime_type,
                    "data": base64_file,
                },
            },
        ],
    }
|
|
|
|
def determine_content_structure(mime_type: str) -> tuple[str, str]:
    """
    Choose the content block type and message structure for a MIME type.

    MIME types starting with "image/" map to the image block; every other
    type, whether a recognized document format or not, maps to the document
    block. No allow-list of document formats is applied, because unknown
    types fall back to "document" as well.

    Args:
        mime_type: The MIME type of the file.

    Returns:
        Tuple of (content_type, message_structure); both elements are
        "image" for image types and "document" otherwise.
    """
    if mime_type.startswith("image/"):
        return "image", "image"

    # Documents and unknown types share the same result.
    return "document", "document"
|
|
|
|
def get_mime_type_from_extension(extension: str) -> str:
    """
    Look up the MIME type that corresponds to a file extension.

    The lookup ignores case and any leading dots, so "pdf", "PDF" and
    ".pdf" all resolve to the same entry.

    Args:
        extension: The file extension, with or without a leading dot.

    Returns:
        The matching MIME type, or "application/octet-stream" when the
        extension is not in the table.
    """
    extension_to_mime = {
        "pdf": "application/pdf",
        "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "doc": "application/msword",
        "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "xls": "application/vnd.ms-excel",
        "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "ppt": "application/vnd.ms-powerpoint",
        "csv": "text/csv",
        "txt": "text/plain",
        "json": "application/json",
        "xml": "application/xml",
        "html": "text/html",
        "htm": "text/html",
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "png": "image/png",
        "gif": "image/gif",
        "webp": "image/webp",
        "svg": "image/svg+xml",
    }

    # Normalize the key so callers need not lowercase or strip the dot first.
    normalized = extension.lower().lstrip(".")

    return extension_to_mime.get(normalized, "application/octet-stream")
|