feat: add voice messages recognition
commit b9d6cde8fe (parent 0b9603d9f2)
main.py · 4 lines changed
@@ -1,6 +1,6 @@
 from src.integrations.gigachat_api_client import GigaChatClient
 from src.bot.telegram_userbot import TelegramUserBot
-from src.utils.logging import setup_logging
+from src.utils import logging
 from src.core.configuration import config


@@ -9,7 +9,7 @@ def main() -> None:
     Entry point for starting the Telegram user bot.
     """
     # Configure logging
-    setup_logging()
+    logging.setup_logging()

     # Load API credentials and configuration
     api_id: str = config.API_ID
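main.py now imports the project's logging helper as a module and calls logging.setup_logging() instead of importing the function directly. The helper module itself is not part of this diff; purely as an illustrative sketch, a setup_logging along these lines (stdlib basicConfig, assumed format string) would satisfy the call:

# Illustrative sketch of src/utils/logging.py; the real module is not shown in this commit.
import logging


def setup_logging(level: int = logging.INFO) -> None:
    """Configure root logging for the bot (assumed behaviour)."""
    logging.basicConfig(
        level=level,
        format="%(asctime)s | %(name)s | %(levelname)s | %(message)s",
    )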
poetry.lock · 989 lines changed (generated)
File diff suppressed because it is too large
pyproject.toml
@@ -6,7 +6,7 @@ authors = ["Factorino73 <masenkin73@xmail.ru>"]
 readme = "README.md"

 [tool.poetry.dependencies]
-python = "^3.13"
+python = "^3.12"
 pyrogram = "^2.0.106"
 tgcrypto = "^1.2.5"
 setuptools = "^75.6.0"
@@ -14,6 +14,15 @@ wheel = "^0.45.1"
 langchain-gigachat = "^0.3.2"
 punq = "^0.7.0"
 pytest = "^8.3.4"
+speechrecognition = "^3.13.0"
+typing-extensions = "^4.12.2"
+pydub = "^0.25.1"
+numpy = "2.0.2"
+soundfile = "^0.13.0"
+torch = "^2.5.1"
+llvmlite = "0.43.0"
+numba = "0.60.0"
+openai-whisper = "^20240930"


 [build-system]
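The dependency block above pins numpy, llvmlite and numba to exact versions and lowers the Python requirement to ^3.12, presumably to keep the Whisper/numba stack installable. A quick sanity check after poetry install (module names inferred from the packages above; the prints are only for confirmation):

# Quick environment check for the new speech stack; run inside the Poetry virtualenv.
import pydub              # audio conversion
import soundfile          # audio I/O bindings
import speech_recognition
import torch
import whisper            # provided by the openai-whisper package

print("SpeechRecognition", speech_recognition.__version__)
print("Torch device:", "cuda" if torch.cuda.is_available() else "cpu")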
src/bot/telegram_userbot.py
@@ -1,5 +1,6 @@
 import logging
 from logging import Logger
+from tempfile import NamedTemporaryFile
 from typing import Optional

 from pyrogram import filters
@@ -8,6 +9,7 @@ from pyrogram.types import Message
 from pyrogram.enums import ChatAction

 from src.integrations.gigachat_api_client import GigaChatClient
+from src.utils import speech_recognition


 class TelegramUserBot:
@@ -41,7 +43,8 @@ class TelegramUserBot:
         Registers the message handlers for the bot.
         """
         self.logger.debug("Registering handlers.")
-        self.app.on_message(filters.command("ai") & filters.text)(self.handle_ai_command)
+        self.app.on_message(filters.command("ai"))(self.handle_ai_command)
+        self.app.on_message(filters.command("voice"))(self.handle_voice_command)

     async def handle_ai_command(self, client: Client, message: Message) -> None:
         """
@@ -62,12 +65,12 @@ class TelegramUserBot:

         if not command_arg:
             self.logger.warning(f"No argument or replied message provided for /ai command by chat_id={message.chat.id}")
-            await message.reply_text("Please provide a message after /ai or reply to a message.")
+            await message.reply("Please provide a message after /ai or reply to a message.", quote=True)
             return

         # Send an initial message indicating processing
         self.logger.debug(f"Processing request for chat_id={message.chat.id}")
-        processing_message: Message = await message.reply_text(f"{self.gigachat_client.model_name} is processing your request...")
+        processing_message: Message = await message.reply(f"{self.gigachat_client.model_name} is processing your request...", quote=True)

         try:
             # Start typing animation
@@ -92,6 +95,54 @@ class TelegramUserBot:
             # Handle any errors and notify the user
             await processing_message.edit_text("An error occurred while processing your request.")

+    async def handle_voice_command(self, client: Client, message: Message) -> None:
+        """
+        Handle the /voice command to convert a voice message to text.
+
+        Args:
+            client (Client): The Pyrogram Client instance.
+            message (Message): The incoming message containing the /voice command.
+        """
+        self.logger.info(f"Received /voice command from chat_id={message.chat.id}.")
+
+        # Check if the reply is to a voice message
+        if not (message.reply_to_message and message.reply_to_message.voice):
+            self.logger.warning("The /voice command was not used in reply to a voice message.")
+            await message.reply("Please reply to a voice message with the /voice command.", quote=True)
+            return
+
+        # Send an initial message indicating processing
+        processing_message: Message = await message.reply_to_message.reply("Converting voice message to text...", quote=True)
+
+        with NamedTemporaryFile(delete=False) as temp_file:
+            file_path = await client.download_media(message.reply_to_message.voice.file_id, file_name=temp_file.name)
+            self.logger.info(f"Voice message downloaded to {file_path}.")
+
+        try:
+            # Attempt to convert voice to text
+            text: str = speech_recognition.convert_voice_to_text(file_path)  # type: ignore
+            self.logger.info("Voice message successfully converted to text.")
+
+            # Format the text for sending
+            formatted_text: str = (
+                "<b>Conversion Result:</b>"
+                "<pre>"
+                f"{text}"
+                "</pre>"
+            )
+
+            # Edit the initial processing message with the converted text
+            await processing_message.edit_text(formatted_text)
+        except FileNotFoundError:
+            self.logger.error("File not found during processing.", exc_info=True)
+            await processing_message.edit_text("An error occurred while processing the voice message. Please try again later.")
+        except RuntimeError:
+            self.logger.error("A runtime error occurred.", exc_info=True)
+            await processing_message.edit_text("An error occurred while processing the voice message. Please try again later.")
+        except Exception:
+            self.logger.error("An unexpected error occurred.", exc_info=True)
+            await processing_message.edit_text("An error occurred while processing the voice message. Please try again later.")
+
     def run(self) -> None:
         """
         Starts the bot.
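handle_voice_command only touches self.logger, the Pyrogram objects it receives, and the speech_recognition helper, so it can be exercised without a live Telegram session. A hedged test sketch, assuming pytest plus pytest-asyncio and stand-in mocks for every Telegram object:

# Test sketch only; the bot instance is replaced by a SimpleNamespace because
# the handler needs nothing beyond self.logger.
import logging
from types import SimpleNamespace
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from src.bot.telegram_userbot import TelegramUserBot


@pytest.mark.asyncio
async def test_voice_command_replies_with_transcript(tmp_path):
    fake_voice = tmp_path / "voice.ogg"
    fake_voice.write_bytes(b"")  # placeholder file "downloaded" by the mock client

    processing_msg = AsyncMock()                     # the "Converting..." message we later edit
    reply_to = MagicMock()                           # the voice message being replied to
    reply_to.voice.file_id = "file-id"
    reply_to.reply = AsyncMock(return_value=processing_msg)

    message = MagicMock(reply_to_message=reply_to)
    message.chat.id = 1
    client = MagicMock(download_media=AsyncMock(return_value=str(fake_voice)))

    bot_stub = SimpleNamespace(logger=logging.getLogger("test"))

    with patch("src.utils.speech_recognition.convert_voice_to_text", return_value="hello"):
        await TelegramUserBot.handle_voice_command(bot_stub, client, message)

    processing_msg.edit_text.assert_awaited_once()
    assert "hello" in processing_msg.edit_text.await_args.args[0]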
src/utils/speech_recognition.py · 107 lines (new file)
@@ -0,0 +1,107 @@
+import os
+import logging
+from logging import Logger
+
+from pydub import AudioSegment
+import speech_recognition as sr
+from speech_recognition.audio import AudioData
+
+
+# Configure logging
+logger: Logger = logging.getLogger(__name__)
+
+def convert_to_wav(file_path: str) -> str:
+    """
+    Converts an audio file to WAV format if it is not already in WAV format.
+
+    Args:
+        file_path (str): The path to the audio file to be converted.
+
+    Returns:
+        str: The path to the converted or original WAV file.
+
+    Raises:
+        RuntimeError: If the conversion fails for any reason.
+    """
+    if file_path.lower().endswith('.wav'):
+        logger.info(f"File {file_path} is already in WAV format.")
+        return file_path
+
+    try:
+        logger.info(f"Converting {file_path} to WAV format.")
+        audio = AudioSegment.from_file(file_path)
+        wav_path: str = f"{os.path.splitext(file_path)[0]}.wav"
+        audio.export(wav_path, format="wav")
+        logger.info(f"File converted to {wav_path}.")
+        return wav_path
+    except Exception as e:
+        logger.error(f"Failed to convert file to WAV: {e}")
+        raise RuntimeError(f"Failed to convert file to WAV: {e}")
+
+def get_audio_duration(file_path: str) -> float:
+    """
+    Retrieves the duration of an audio file in seconds.
+
+    Args:
+        file_path (str): The path to the audio file.
+
+    Returns:
+        float: The duration of the audio file in seconds.
+
+    Raises:
+        RuntimeError: If unable to get the file duration.
+    """
+    try:
+        logger.info(f"Getting duration of {file_path}.")
+        audio = AudioSegment.from_file(file_path)
+        duration: float = len(audio) / 1000  # Duration in seconds
+        logger.info(f"Duration of {file_path}: {duration} seconds.")
+        return duration
+    except Exception as e:
+        logger.error(f"Failed to get file duration: {e}")
+        raise RuntimeError(f"Failed to get file duration: {e}")
+
+def convert_voice_to_text(file_path: str, language='ru') -> str:
+    """
+    Converts speech from an audio file to text using OpenAI speech recognition service.
+
+    Args:
+        file_path (str): The path to the audio file to be processed.
+        language (str): The language code for speech recognition (default is 'ru').
+
+    Returns:
+        str: The transcribed text if recognition is successful.
+
+    Raises:
+        RuntimeError: For any errors encountered during processing.
+    """
+    # Check if the file exists
+    if not os.path.exists(file_path):
+        logger.error(f"File {file_path} does not exist.")
+        raise FileNotFoundError("File does not exist.")
+
+    # Convert the file to WAV format if necessary
+    try:
+        wav_path: str = convert_to_wav(file_path)
+    except RuntimeError as e:
+        logger.error(f"Error converting to WAV: {e}")
+        raise RuntimeError(f"Error converting to WAV: {e}")
+
+    recognizer = sr.Recognizer()
+
+    try:
+        logger.info(f"Processing file {wav_path} ({get_audio_duration(wav_path)} sec) for speech recognition.")
+        with sr.AudioFile(wav_path) as source:
+            audio_data: AudioData = recognizer.record(source)
+        text = recognizer.recognize_whisper(audio_data, language=language, model='medium')
+        logger.info("Speech recognition successful.")
+        return text  # type: ignore
+    except sr.UnknownValueError:
+        logger.warning(f"Speech in {wav_path} could not be recognized.")
+        raise RuntimeError("Speech could not be recognized.")
+    except sr.RequestError as e:
+        logger.error(f"Request error from the recognition service: {e}")
+        raise RuntimeError(f"Request error from the recognition service: {e}")
+    except Exception as e:
+        logger.error(f"An unexpected error occurred: {e}")
+        raise RuntimeError(f"An unexpected error occurred: {e}")
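A short usage sketch of the new helper module, assuming a local audio file and the default Russian language setting; the first call to convert_voice_to_text will download the 'medium' Whisper model weights:

# Hedged usage sketch; "voice.ogg" is a placeholder path.
from src.utils import speech_recognition

wav_path = speech_recognition.convert_to_wav("voice.ogg")   # returns the input unchanged for .wav files
print(speech_recognition.get_audio_duration(wav_path), "seconds")

try:
    text = speech_recognition.convert_voice_to_text(wav_path, language="ru")
    print(text)
except (FileNotFoundError, RuntimeError) as err:
    print(f"Recognition failed: {err}")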