feat: add voice message recognition

parent 0b9603d9f2
commit b9d6cde8fe
5 changed files with 1152 additions and 16 deletions

View File

@@ -1,6 +1,6 @@
 from src.integrations.gigachat_api_client import GigaChatClient
 from src.bot.telegram_userbot import TelegramUserBot
-from src.utils.logging import setup_logging
+from src.utils import logging
 from src.core.configuration import config
 
 
@@ -9,7 +9,7 @@ def main() -> None:
     Entry point for starting the Telegram user bot.
     """
     # Configure logging
-    setup_logging()
+    logging.setup_logging()
 
     # Load API credentials and configuration
     api_id: str = config.API_ID
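
For context, the entry-point module now imports the project's src.utils.logging module as a whole and calls setup_logging through it, instead of importing the function directly. That module (presumably src/utils/logging.py) is not part of this diff; a minimal sketch of what it could export, assuming it only configures the root logger, is shown below (everything beyond the setup_logging name is hypothetical):

import logging


def setup_logging(level: int = logging.INFO) -> None:
    # Hypothetical sketch of src/utils/logging.py; the real module is not shown in this commit.
    logging.basicConfig(
        level=level,
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    )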

poetry.lock (generated): 989 changed lines; diff suppressed because it is too large.

View File

@@ -6,7 +6,7 @@ authors = ["Factorino73 <masenkin73@xmail.ru>"]
 readme = "README.md"
 
 [tool.poetry.dependencies]
-python = "^3.13"
+python = "^3.12"
 pyrogram = "^2.0.106"
 tgcrypto = "^1.2.5"
 setuptools = "^75.6.0"
@@ -14,6 +14,15 @@ wheel = "^0.45.1"
 langchain-gigachat = "^0.3.2"
 punq = "^0.7.0"
 pytest = "^8.3.4"
+speechrecognition = "^3.13.0"
+typing-extensions = "^4.12.2"
+pydub = "^0.25.1"
+numpy = "2.0.2"
+soundfile = "^0.13.0"
+torch = "^2.5.1"
+llvmlite = "0.43.0"
+numba = "0.60.0"
+openai-whisper = "^20240930"
 
 [build-system]
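
The new dependencies wire up local Whisper transcription: speechrecognition provides the recognizer front end, openai-whisper and torch run the model itself, and pydub plus soundfile handle audio handling (pydub additionally expects an ffmpeg binary on the PATH to decode non-WAV input such as Telegram OGG voice notes). The exact numpy, llvmlite and numba pins appear to be there to keep the numba-based Whisper dependency chain resolvable, and the drop from python ^3.13 to ^3.12 is most likely because parts of this stack did not yet ship wheels for Python 3.13. A quick smoke check after poetry install might look like the following (the script and its prints are illustrative, not part of the commit):

# smoke_check.py -- hypothetical helper to verify the speech stack installs and loads.
import speech_recognition as sr
import whisper
from pydub import AudioSegment  # needs ffmpeg on PATH to decode OGG voice notes

print("SpeechRecognition:", sr.__version__)
model = whisper.load_model("medium")  # downloads the model weights (about 1.5 GB) on first run
print("Whisper 'medium' loaded on", model.device)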

View File

@@ -1,5 +1,6 @@
 import logging
 from logging import Logger
+from tempfile import NamedTemporaryFile
 from typing import Optional
 
 from pyrogram import filters
@@ -8,6 +9,7 @@ from pyrogram.types import Message
 from pyrogram.enums import ChatAction
 
 from src.integrations.gigachat_api_client import GigaChatClient
+from src.utils import speech_recognition
 
 
 class TelegramUserBot:
@@ -41,7 +43,8 @@
         Registers the message handlers for the bot.
         """
         self.logger.debug("Registering handlers.")
-        self.app.on_message(filters.command("ai") & filters.text)(self.handle_ai_command)
+        self.app.on_message(filters.command("ai"))(self.handle_ai_command)
+        self.app.on_message(filters.command("voice"))(self.handle_voice_command)
 
     async def handle_ai_command(self, client: Client, message: Message) -> None:
         """
@@ -62,12 +65,12 @@
         if not command_arg:
             self.logger.warning(f"No argument or replied message provided for /ai command by chat_id={message.chat.id}")
-            await message.reply_text("Please provide a message after /ai or reply to a message.")
+            await message.reply("Please provide a message after /ai or reply to a message.", quote=True)
             return
 
         # Send an initial message indicating processing
         self.logger.debug(f"Processing request for chat_id={message.chat.id}")
-        processing_message: Message = await message.reply_text(f"{self.gigachat_client.model_name} is processing your request...")
+        processing_message: Message = await message.reply(f"{self.gigachat_client.model_name} is processing your request...", quote=True)
 
         try:
             # Start typing animation
@@ -92,6 +95,54 @@
             # Handle any errors and notify the user
             await processing_message.edit_text("An error occurred while processing your request.")
 
+    async def handle_voice_command(self, client: Client, message: Message) -> None:
+        """
+        Handle the /voice command to convert a voice message to text.
+
+        Args:
+            client (Client): The Pyrogram Client instance.
+            message (Message): The incoming message containing the /voice command.
+        """
+        self.logger.info(f"Received /voice command from chat_id={message.chat.id}.")
+
+        # Check if the reply is to a voice message
+        if not (message.reply_to_message and message.reply_to_message.voice):
+            self.logger.warning("The /voice command was not used in reply to a voice message.")
+            await message.reply("Please reply to a voice message with the /voice command.", quote=True)
+            return
+
+        # Send an initial message indicating processing
+        processing_message: Message = await message.reply_to_message.reply("Converting voice message to text...", quote=True)
+
+        with NamedTemporaryFile(delete=False) as temp_file:
+            file_path = await client.download_media(message.reply_to_message.voice.file_id, file_name=temp_file.name)
+            self.logger.info(f"Voice message downloaded to {file_path}.")
+
+        try:
+            # Attempt to convert voice to text
+            text: str = speech_recognition.convert_voice_to_text(file_path)  # type: ignore
+            self.logger.info("Voice message successfully converted to text.")
+
+            # Format the text for sending
+            formatted_text: str = (
+                "<b>Conversion Result:</b>"
+                "<pre>"
+                f"{text}"
+                "</pre>"
+            )
+
+            # Edit the initial processing message with the converted text
+            await processing_message.edit_text(formatted_text)
+        except FileNotFoundError:
+            self.logger.error("File not found during processing.", exc_info=True)
+            await processing_message.edit_text("An error occurred while processing the voice message. Please try again later.")
+        except RuntimeError:
+            self.logger.error("A runtime error occurred.", exc_info=True)
+            await processing_message.edit_text("An error occurred while processing the voice message. Please try again later.")
+        except Exception:
+            self.logger.error("An unexpected error occurred.", exc_info=True)
+            await processing_message.edit_text("An error occurred while processing the voice message. Please try again later.")
+
     def run(self) -> None:
         """
         Starts the bot.
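
The handler above downloads the voice note into a NamedTemporaryFile created with delete=False, so the downloaded file is not removed automatically after transcription, and the <b>/<pre> markup in formatted_text relies on Pyrogram's default parse mode accepting HTML tags. For reference, a minimal standalone sketch of the same download-and-transcribe pattern with explicit cleanup (the helper name is illustrative and not part of the commit):

import os
from tempfile import NamedTemporaryFile

from pyrogram import Client
from pyrogram.types import Message

from src.utils import speech_recognition


async def transcribe_replied_voice(client: Client, message: Message) -> str:
    # Illustrative sketch only: download the replied-to voice note, transcribe it,
    # then remove the temporary file (delete=False would otherwise leave it behind).
    voice = message.reply_to_message.voice
    with NamedTemporaryFile(suffix=".ogg", delete=False) as temp_file:
        file_path = await client.download_media(voice.file_id, file_name=temp_file.name)
    try:
        return speech_recognition.convert_voice_to_text(file_path)
    finally:
        os.remove(file_path)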

View File

@@ -0,0 +1,107 @@
+import os
+import logging
+from logging import Logger
+
+from pydub import AudioSegment
+import speech_recognition as sr
+from speech_recognition.audio import AudioData
+
+# Configure logging
+logger: Logger = logging.getLogger(__name__)
+
+
+def convert_to_wav(file_path: str) -> str:
+    """
+    Converts an audio file to WAV format if it is not already in WAV format.
+
+    Args:
+        file_path (str): The path to the audio file to be converted.
+
+    Returns:
+        str: The path to the converted or original WAV file.
+
+    Raises:
+        RuntimeError: If the conversion fails for any reason.
+    """
+    if file_path.lower().endswith('.wav'):
+        logger.info(f"File {file_path} is already in WAV format.")
+        return file_path
+
+    try:
+        logger.info(f"Converting {file_path} to WAV format.")
+        audio = AudioSegment.from_file(file_path)
+        wav_path: str = f"{os.path.splitext(file_path)[0]}.wav"
+        audio.export(wav_path, format="wav")
+        logger.info(f"File converted to {wav_path}.")
+        return wav_path
+    except Exception as e:
+        logger.error(f"Failed to convert file to WAV: {e}")
+        raise RuntimeError(f"Failed to convert file to WAV: {e}")
+
+
+def get_audio_duration(file_path: str) -> float:
+    """
+    Retrieves the duration of an audio file in seconds.
+
+    Args:
+        file_path (str): The path to the audio file.
+
+    Returns:
+        float: The duration of the audio file in seconds.
+
+    Raises:
+        RuntimeError: If unable to get the file duration.
+    """
+    try:
+        logger.info(f"Getting duration of {file_path}.")
+        audio = AudioSegment.from_file(file_path)
+        duration: float = len(audio) / 1000  # Duration in seconds
+        logger.info(f"Duration of {file_path}: {duration} seconds.")
+        return duration
+    except Exception as e:
+        logger.error(f"Failed to get file duration: {e}")
+        raise RuntimeError(f"Failed to get file duration: {e}")
+
+
+def convert_voice_to_text(file_path: str, language='ru') -> str:
+    """
+    Converts speech from an audio file to text using OpenAI's Whisper speech recognition model.
+
+    Args:
+        file_path (str): The path to the audio file to be processed.
+        language (str): The language code for speech recognition (default is 'ru').
+
+    Returns:
+        str: The transcribed text if recognition is successful.
+
+    Raises:
+        FileNotFoundError: If the audio file does not exist.
+        RuntimeError: For any other errors encountered during processing.
+    """
+    # Check if the file exists
+    if not os.path.exists(file_path):
+        logger.error(f"File {file_path} does not exist.")
+        raise FileNotFoundError("File does not exist.")
+
+    # Convert the file to WAV format if necessary
+    try:
+        wav_path: str = convert_to_wav(file_path)
+    except RuntimeError as e:
+        logger.error(f"Error converting to WAV: {e}")
+        raise RuntimeError(f"Error converting to WAV: {e}")
+
+    recognizer = sr.Recognizer()
+    try:
+        logger.info(f"Processing file {wav_path} ({get_audio_duration(wav_path)} sec) for speech recognition.")
+        with sr.AudioFile(wav_path) as source:
+            audio_data: AudioData = recognizer.record(source)
+            text = recognizer.recognize_whisper(audio_data, language=language, model='medium')
+            logger.info("Speech recognition successful.")
+            return text  # type: ignore
+    except sr.UnknownValueError:
+        logger.warning(f"Speech in {wav_path} could not be recognized.")
+        raise RuntimeError("Speech could not be recognized.")
+    except sr.RequestError as e:
+        logger.error(f"Request error from the recognition service: {e}")
+        raise RuntimeError(f"Request error from the recognition service: {e}")
+    except Exception as e:
+        logger.error(f"An unexpected error occurred: {e}")
+        raise RuntimeError(f"An unexpected error occurred: {e}")