feat: add voice message recognition

2025-01-04 16:31:40 +04:00 · 2025-01-04 16:31:40 +04:00 · b9d6cde8fe
commit b9d6cde8fe
parent 0b9603d9f2
5 changed files with 1152 additions and 16 deletions
--- a/main.py
+++ b/main.py
@ -1,6 +1,6 @@
 from src.integrations.gigachat_api_client import GigaChatClient
 from src.bot.telegram_userbot import TelegramUserBot
-from src.utils.logging import setup_logging
+from src.utils import logging
 from src.core.configuration import config


@ -9,7 +9,7 @@ def main() -> None:
    Entry point for starting the Telegram user bot.
    """
    # Configure logging
-    setup_logging()
+    logging.setup_logging()
    
    # Load API credentials and configuration
    api_id: str = config.API_ID
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -6,7 +6,7 @@ authors = ["Factorino73 <masenkin73@xmail.ru>"]
 readme = "README.md"

 [tool.poetry.dependencies]
-python = "^3.13"
+python = "^3.12"
 pyrogram = "^2.0.106"
 tgcrypto = "^1.2.5"
 setuptools = "^75.6.0"
@ -14,6 +14,15 @@ wheel = "^0.45.1"
 langchain-gigachat = "^0.3.2"
 punq = "^0.7.0"
 pytest = "^8.3.4"
+speechrecognition = "^3.13.0"
+typing-extensions = "^4.12.2"
+pydub = "^0.25.1"
+numpy = "2.0.2"
+soundfile = "^0.13.0"
+torch = "^2.5.1"
+llvmlite = "0.43.0"
+numba = "0.60.0"
+openai-whisper = "^20240930"


 [build-system]
--- a/src/bot/telegram_userbot.py
+++ b/src/bot/telegram_userbot.py
@ -1,5 +1,6 @@
 import logging
 from logging import Logger
+from tempfile import NamedTemporaryFile
 from typing import Optional

 from pyrogram import filters
@ -8,6 +9,7 @@ from pyrogram.types import Message
 from pyrogram.enums import ChatAction

 from src.integrations.gigachat_api_client import GigaChatClient
+from src.utils import speech_recognition


 class TelegramUserBot:
@ -41,7 +43,8 @@ class TelegramUserBot:
        Registers the message handlers for the bot.
        """
        self.logger.debug("Registering handlers.")
-        self.app.on_message(filters.command("ai") & filters.text)(self.handle_ai_command)
+        self.app.on_message(filters.command("ai"))(self.handle_ai_command)
+        self.app.on_message(filters.command("voice"))(self.handle_voice_command)

    async def handle_ai_command(self, client: Client, message: Message) -> None:
        """
@ -62,12 +65,12 @@ class TelegramUserBot:

        if not command_arg:
            self.logger.warning(f"No argument or replied message provided for /ai command by chat_id={message.chat.id}")
-            await message.reply_text("Please provide a message after /ai or reply to a message.")
+            await message.reply("Please provide a message after /ai or reply to a message.", quote=True)
            return

        # Send an initial message indicating processing
        self.logger.debug(f"Processing request for chat_id={message.chat.id}")
-        processing_message: Message = await message.reply_text(f"{self.gigachat_client.model_name} is processing your request...")
+        processing_message: Message = await message.reply(f"{self.gigachat_client.model_name} is processing your request...", quote=True)

        try:
            # Start typing animation
@ -92,6 +95,54 @@ class TelegramUserBot:
            # Handle any errors and notify the user
            await processing_message.edit_text("An error occurred while processing your request.")

+    async def handle_voice_command(self, client: Client, message: Message) -> None:
+        """
+        Handle the /voice command to convert a voice message to text.
+
+        The command must be sent as a reply to a voice message. The voice file
+        is downloaded into a temporary directory, transcribed, and the result
+        replaces the "processing" message. The temporary directory (including
+        any intermediate files created next to the download) is removed once
+        transcription finishes, whether it succeeded or failed.
+
+        Args:
+            client (Client): The Pyrogram Client instance.
+            message (Message): The incoming message containing the /voice command.
+        """
+        # Imported locally so the method does not depend on module-level imports.
+        import asyncio
+        import html
+        import os
+        from tempfile import TemporaryDirectory
+
+        self.logger.info(f"Received /voice command from chat_id={message.chat.id}.")
+
+        # Check if the reply is to a voice message
+        voice_message = message.reply_to_message
+        if not (voice_message and voice_message.voice):
+            self.logger.warning("The /voice command was not used in reply to a voice message.")
+            await message.reply("Please reply to a voice message with the /voice command.", quote=True)
+            return
+
+        # Send an initial message indicating processing
+        processing_message: Message = await voice_message.reply("Converting voice message to text...", quote=True)
+        error_text: str = "An error occurred while processing the voice message. Please try again later."
+
+        try:
+            # Everything written into this directory is deleted when the block exits,
+            # so neither the download nor a converted copy is left behind.
+            with TemporaryDirectory() as temp_dir:
+                file_path = await client.download_media(
+                    voice_message.voice.file_id,
+                    file_name=os.path.join(temp_dir, "voice.ogg"),
+                )
+                if not file_path:
+                    raise FileNotFoundError("Voice message could not be downloaded.")
+                self.logger.info(f"Voice message downloaded to {file_path}.")
+
+                # Transcription is synchronous and slow; run it in a worker thread
+                # so the event loop keeps serving other updates meanwhile.
+                text: str = await asyncio.to_thread(
+                    speech_recognition.convert_voice_to_text, str(file_path)  # type: ignore
+                )
+            self.logger.info("Voice message successfully converted to text.")
+
+            # Format the text for sending; escape it so characters such as "<"
+            # or "&" in the transcription cannot break the HTML markup.
+            formatted_text: str = (
+                "<b>Conversion Result:</b>"
+                "<pre>"
+                f"{html.escape(text, quote=False)}"
+                "</pre>"
+            )
+
+            # Edit the initial processing message with the converted text
+            await processing_message.edit_text(formatted_text)
+        except FileNotFoundError:
+            self.logger.error("File not found during processing.", exc_info=True)
+            await processing_message.edit_text(error_text)
+        except RuntimeError:
+            self.logger.error("A runtime error occurred.", exc_info=True)
+            await processing_message.edit_text(error_text)
+        except Exception:
+            self.logger.error("An unexpected error occurred.", exc_info=True)
+            await processing_message.edit_text(error_text)
+
    def run(self) -> None:
        """
        Starts the bot.
--- a/src/utils/speech_recognition.py
+++ b/src/utils/speech_recognition.py
@ -0,0 +1,107 @@
+import os
+import logging
+from logging import Logger
+
+from pydub import AudioSegment
+import speech_recognition as sr
+from speech_recognition.audio import AudioData
+
+
+# Configure logging
+logger: Logger = logging.getLogger(__name__)
+
+def convert_to_wav(file_path: str) -> str:
+    """
+    Converts an audio file to WAV format if it is not already in WAV format.
+
+    Whether the file is "already WAV" is decided from its extension only
+    (case-insensitive). Otherwise the converted file is written next to the
+    source file, using the same base name with a ``.wav`` extension.
+
+    Args:
+        file_path (str): The path to the audio file to be converted.
+
+    Returns:
+        str: The path to the converted or original WAV file.
+
+    Raises:
+        RuntimeError: If the conversion fails for any reason. The original
+            exception is attached as the cause.
+    """
+    if file_path.lower().endswith('.wav'):
+        logger.info(f"File {file_path} is already in WAV format.")
+        return file_path
+
+    try:
+        logger.info(f"Converting {file_path} to WAV format.")
+        audio = AudioSegment.from_file(file_path)
+        wav_path: str = f"{os.path.splitext(file_path)[0]}.wav"
+        audio.export(wav_path, format="wav")
+        logger.info(f"File converted to {wav_path}.")
+        return wav_path
+    except Exception as e:
+        logger.error(f"Failed to convert file to WAV: {e}")
+        # Chain the cause explicitly so the underlying error stays in the traceback.
+        raise RuntimeError(f"Failed to convert file to WAV: {e}") from e
+
+def get_audio_duration(file_path: str) -> float:
+    """
+    Retrieves the duration of an audio file in seconds.
+
+    The file is loaded through ``AudioSegment.from_file`` and its length is
+    divided by 1000 to obtain seconds.
+
+    Args:
+        file_path (str): The path to the audio file.
+
+    Returns:
+        float: The duration of the audio file in seconds.
+
+    Raises:
+        RuntimeError: If unable to get the file duration. The original
+            exception is attached as the cause.
+    """
+    try:
+        logger.info(f"Getting duration of {file_path}.")
+        audio = AudioSegment.from_file(file_path)
+        duration: float = len(audio) / 1000  # Duration in seconds
+        logger.info(f"Duration of {file_path}: {duration} seconds.")
+        return duration
+    except Exception as e:
+        logger.error(f"Failed to get file duration: {e}")
+        # Chain the cause explicitly so the underlying error stays in the traceback.
+        raise RuntimeError(f"Failed to get file duration: {e}") from e
+
+def convert_voice_to_text(file_path: str, language: str = 'ru', model: str = 'medium') -> str:
+    """
+    Converts speech from an audio file to text using Whisper speech recognition.
+
+    The file is converted to WAV when its extension is not ``.wav``, read in
+    full, and passed to the recognizer's ``recognize_whisper`` method. A WAV
+    file created by this function for the conversion is removed again before
+    returning; the input file itself is never deleted.
+
+    Args:
+        file_path (str): The path to the audio file to be processed.
+        language (str): The language code for speech recognition (default is 'ru').
+        model (str): The Whisper model name passed to the recognizer
+            (default is 'medium').
+
+    Returns:
+        str: The transcribed text if recognition is successful.
+
+    Raises:
+        FileNotFoundError: If ``file_path`` does not exist.
+        RuntimeError: For any other error encountered during processing
+            (conversion failure, unrecognizable speech, recognizer errors).
+    """
+    # Check if the file exists
+    if not os.path.exists(file_path):
+        logger.error(f"File {file_path} does not exist.")
+        raise FileNotFoundError("File does not exist.")
+
+    # Convert the file to WAV format if necessary
+    try:
+        wav_path: str = convert_to_wav(file_path)
+    except RuntimeError as e:
+        logger.error(f"Error converting to WAV: {e}")
+        raise RuntimeError(f"Error converting to WAV: {e}") from e
+
+    recognizer = sr.Recognizer()
+
+    try:
+        logger.info(f"Processing file {wav_path} ({get_audio_duration(wav_path)} sec) for speech recognition.")
+        with sr.AudioFile(wav_path) as source:
+            audio_data: AudioData = recognizer.record(source)
+            text = recognizer.recognize_whisper(audio_data, language=language, model=model)
+            logger.info("Speech recognition successful.")
+            return text  # type: ignore
+    except sr.UnknownValueError as e:
+        logger.warning(f"Speech in {wav_path} could not be recognized.")
+        raise RuntimeError("Speech could not be recognized.") from e
+    except sr.RequestError as e:
+        logger.error(f"Request error from the recognition service: {e}")
+        raise RuntimeError(f"Request error from the recognition service: {e}") from e
+    except Exception as e:
+        logger.error(f"An unexpected error occurred: {e}")
+        raise RuntimeError(f"An unexpected error occurred: {e}") from e
+    finally:
+        # Only remove the intermediate file this function created; when the
+        # input was already WAV, wav_path is the caller's own file.
+        if wav_path != file_path:
+            try:
+                os.remove(wav_path)
+            except OSError as cleanup_error:
+                logger.warning(f"Could not remove temporary file {wav_path}: {cleanup_error}")