Audio module and progress on images (schemas and openAI)

lansan69
2026-03-31 02:25:58 -06:00
parent 4d6152a9fe
commit fc25a47f04
37 changed files with 538 additions and 82 deletions


@@ -10,9 +10,14 @@ Purpose:
import tempfile
import os
import json
from dotenv import load_dotenv
from fastapi import HTTPException
from openai import OpenAI, AsyncOpenAI
import assemblyai as aai
from deepgram import (
DeepgramClient,
)
from app.core.config import settings
from app.schemas.audio_standard import AudioRequestFile
from app.schemas.audio_standard import StandardTranscriptionResult
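For context, here is a minimal sketch of the two imported schemas, with fields inferred from how they are used in this diff; the exact types, defaults, and config are assumptions, not taken from the commit:

from typing import List, Optional
from fastapi import UploadFile
from pydantic import BaseModel

class AudioRequestFile(BaseModel):
    # Field names match usages below; types and defaults are assumptions
    file: UploadFile
    provider: str              # "openai" | "assemblyai" | "deepgram"
    model: str
    diarization: bool = False
    sentiment: bool = False
    timestamps: bool = False

    class Config:
        arbitrary_types_allowed = True  # UploadFile is not a native pydantic type

class StandardTranscriptionResult(BaseModel):
    status: str
    original_filename: str
    full_transcript: str
    model_used: str
    provider_used: str
    confidence_score: Optional[float] = None
    segments: Optional[List[dict]] = None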
@@ -24,6 +29,7 @@ async def transcribe_audio_with_provider(audio_request: AudioRequestFile) -> Sta
"""
Adapter function to transcribe audio using the configured AI provider.
"""
load_dotenv()
provider = audio_request.provider.lower()
match provider:
@@ -31,6 +37,8 @@ async def transcribe_audio_with_provider(audio_request: AudioRequestFile) -> Sta
return await transcribe_with_openai(audio_request)
case "assemblyai":
return await transcribe_with_assemblyai(audio_request)
case "deepgram":
return await transcribe_with_deepgram(audio_request)
case _:
raise ValueError(f"Unsupported AI provider: {audio_request.provider}")
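A hedged sketch of how a route could drive this adapter; the path, form fields, and router wiring are assumptions, not part of this commit:

from fastapi import APIRouter, File, Form, UploadFile

router = APIRouter()

@router.post("/audio/transcribe", response_model=StandardTranscriptionResult)
async def transcribe_endpoint(
    file: UploadFile = File(...),
    provider: str = Form("openai"),
    model: str = Form("whisper-1"),
    diarization: bool = Form(False),
    sentiment: bool = Form(False),
    timestamps: bool = Form(False),
):
    # Pack the form fields into the request schema and delegate to the adapter
    request = AudioRequestFile(
        file=file, provider=provider, model=model,
        diarization=diarization, sentiment=sentiment, timestamps=timestamps,
    )
    return await transcribe_audio_with_provider(request)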
@@ -39,6 +47,7 @@ async def transcribe_with_openai(audio_request: AudioRequestFile) -> StandardTra
"""
Adapter function to transcribe audio using OpenAI.
"""
client = AsyncOpenAI(api_key=settings.OPENAI_API_KEY)
audio_content = await audio_request.file.read()
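The rest of this function is elided from the diff; presumably the bytes end up in a call to the async transcription endpoint, roughly like the sketch below (the tuple-style file argument and the model value are assumptions):

# Minimal sketch, not the commit's actual body
transcription = await client.audio.transcriptions.create(
    model=audio_request.model,  # e.g. "whisper-1"
    file=(audio_request.file.filename, audio_content),
)
full_transcript = transcription.text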
@@ -112,7 +121,8 @@ async def transcribe_with_assemblyai(audio_request: AudioRequestFile) -> Standar
temp_audio_path = temp_audio.name
# Select the speech model and options to use
config = aai.TranscriptionConfig(speech_model=audio_request.model,
    language_code="es", speaker_labels=audio_request.diarization,
    sentiment_analysis=audio_request.sentiment)
transcription_obj = aai.Transcriber(config=config).transcribe(temp_audio_path)
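One design note: aai.Transcriber.transcribe is a blocking call inside an async function, so it stalls the event loop while the upload and polling run. If that matters, it could be handed to a worker thread; a sketch:

import asyncio

# Run the blocking SDK call without blocking the event loop
transcription_obj = await asyncio.to_thread(
    aai.Transcriber(config=config).transcribe, temp_audio_path
)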
@@ -159,5 +169,71 @@ async def transcribe_with_assemblyai(audio_request: AudioRequestFile) -> Standar
os.unlink(temp_audio_path)
except Exception:
pass
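validate_audio_request, called below, is not shown in this diff; a minimal sketch of what such a guard might check, reusing the os and HTTPException imports at the top of the file (the size limit and extension list are pure assumptions):

ALLOWED_EXTENSIONS = {".mp3", ".wav", ".m4a", ".ogg"}  # assumption
MAX_AUDIO_BYTES = 25 * 1024 * 1024                     # assumption: 25 MB cap

def validate_audio_request(audio_request: AudioRequestFile, audio_content: bytes) -> None:
    """Reject empty, oversized, or unsupported uploads before calling a provider."""
    if not audio_content:
        raise HTTPException(status_code=400, detail="Empty audio file")
    if len(audio_content) > MAX_AUDIO_BYTES:
        raise HTTPException(status_code=413, detail="Audio file too large")
    ext = os.path.splitext(audio_request.file.filename)[1].lower()
    if ext not in ALLOWED_EXTENSIONS:
        raise HTTPException(status_code=415, detail=f"Unsupported audio format: {ext}")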
async def transcribe_with_deepgram(audio_request: AudioRequestFile) -> StandardTranscriptionResult:
"""
Adapter function to transcribe audio using Deepgram.
"""
# Initialize the Deepgram client
deepgram = DeepgramClient(api_key=settings.DEEPGRAM_API_KEY)
audio_content = await audio_request.file.read()
temp_audio_path = None
# Validate the audio before continuing
validate_audio_request(audio_request, audio_content)
try:
# Create a temporary file for the audio
with tempfile.NamedTemporaryFile(
delete=False,
suffix=os.path.splitext(audio_request.file.filename)[1]
) as temp_audio:
temp_audio.write(audio_content)
temp_audio_path = temp_audio.name
with open(temp_audio_path, "rb") as audio_file:
response = deepgram.listen.v1.media.transcribe_file(
request=audio_file.read(),
model=audio_request.model,
sentiment=audio_request.sentiment,
utterances=audio_request.diarization,
diarize=audio_request.diarization,
# Deepgram has no dedicated "timestamps" option, but it returns per-segment timestamps anyway, so no extra parameter is needed here
smart_format=True,
language='es',
)
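# Normalize the SDK response object into a plain dict for uniform key access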
response_json = json.loads(response.json())
result = StandardTranscriptionResult(
status="success",
original_filename=audio_request.file.filename,
full_transcript=response_json.get("results", {}).get("channels", [{}])[0].get("alternatives", [{}])[0].get("transcript", ""),
model_used=audio_request.model,
provider_used="Deepgram",
confidence_score=response_json.get("results", {}).get("channels", [{}])[0].get("alternatives", [{}])[0].get("confidence"),
segments=[
{
"text": sentence.get("text", ""),
"speaker": f"Speaker {paragraph.get('speaker')}" if audio_request.diarization and paragraph.get("speaker") is not None else None,
"start_time": sentence.get("start") if audio_request.timestamps else None,
"end_time": sentence.get("end") if audio_request.timestamps else None,
"sentiment": sentence.get("sentiment") if audio_request.sentiment else None
}
for paragraph in response_json.get("results", {}).get("channels", [{}])[0].get("alternatives", [{}])[0].get("paragraphs", {}).get("paragraphs", [])
for sentence in paragraph.get("sentences", [])
] if (audio_request.diarization or audio_request.timestamps or audio_request.sentiment) else None
)
return result
except Exception as e:
# Catch any Deepgram or file-handling error
raise HTTPException(
status_code=500,
detail=f"Error transcribing the audio: {str(e)}"
)
finally:
if temp_audio_path and os.path.exists(temp_audio_path):
try:
os.unlink(temp_audio_path)
except Exception:
pass
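The chained .get() lookups into results.channels[0].alternatives[0] appear three times above; they could be factored into one helper. A sketch (the helper name is hypothetical):

def _first_alternative(response_json: dict) -> dict:
    """Safely pull results.channels[0].alternatives[0] out of a Deepgram response."""
    channels = response_json.get("results", {}).get("channels", [{}])
    alternatives = (channels[0] if channels else {}).get("alternatives", [{}])
    return alternatives[0] if alternatives else {}

# Usage inside transcribe_with_deepgram:
#   alt = _first_alternative(response_json)
#   full_transcript = alt.get("transcript", "")
#   confidence = alt.get("confidence")
#   paragraphs = alt.get("paragraphs", {}).get("paragraphs", [])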