feat: add voice message recognition

parent 0b9603d9f2
commit b9d6cde8fe
5 changed files with 1152 additions and 16 deletions

View File

@@ -1,6 +1,6 @@
 from src.integrations.gigachat_api_client import GigaChatClient
 from src.bot.telegram_userbot import TelegramUserBot
-from src.utils.logging import setup_logging
+from src.utils import logging
 from src.core.configuration import config
 
 
@@ -9,7 +9,7 @@ def main() -> None:
     Entry point for starting the Telegram user bot.
     """
     # Configure logging
-    setup_logging()
+    logging.setup_logging()
 
     # Load API credentials and configuration
     api_id: str = config.API_ID
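
For context, the entry-point module now imports the project's src.utils.logging module as a whole and calls setup_logging through it, instead of importing the function directly. That module (presumably src/utils/logging.py) is not part of this diff; a minimal sketch of what it could export, assuming it only configures the root logger, is shown below (everything beyond the setup_logging name is hypothetical):

import logging


def setup_logging(level: int = logging.INFO) -> None:
    # Hypothetical sketch of src/utils/logging.py; the real module is not shown in this commit.
    logging.basicConfig(
        level=level,
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    )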

poetry.lock (generated): 989 changed lines; diff suppressed because it is too large.

View File

@@ -6,7 +6,7 @@ authors = ["Factorino73 <masenkin73@xmail.ru>"]
 readme = "README.md"
 
 [tool.poetry.dependencies]
-python = "^3.13"
+python = "^3.12"
 pyrogram = "^2.0.106"
 tgcrypto = "^1.2.5"
 setuptools = "^75.6.0"
@@ -14,6 +14,15 @@ wheel = "^0.45.1"
 langchain-gigachat = "^0.3.2"
 punq = "^0.7.0"
 pytest = "^8.3.4"
+speechrecognition = "^3.13.0"
+typing-extensions = "^4.12.2"
+pydub = "^0.25.1"
+numpy = "2.0.2"
+soundfile = "^0.13.0"
+torch = "^2.5.1"
+llvmlite = "0.43.0"
+numba = "0.60.0"
+openai-whisper = "^20240930"
 
 [build-system]
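
The new dependencies wire up local Whisper transcription: speechrecognition provides the recognizer front end, openai-whisper and torch run the model itself, and pydub plus soundfile handle audio handling (pydub additionally expects an ffmpeg binary on the PATH to decode non-WAV input such as Telegram OGG voice notes). The exact numpy, llvmlite and numba pins appear to be there to keep the numba-based Whisper dependency chain resolvable, and the drop from python ^3.13 to ^3.12 is most likely because parts of this stack did not yet ship wheels for Python 3.13. A quick smoke check after poetry install might look like the following (the script and its prints are illustrative, not part of the commit):

# smoke_check.py -- hypothetical helper to verify the speech stack installs and loads.
import speech_recognition as sr
import whisper
from pydub import AudioSegment  # needs ffmpeg on PATH to decode OGG voice notes

print("SpeechRecognition:", sr.__version__)
model = whisper.load_model("medium")  # downloads the model weights (about 1.5 GB) on first run
print("Whisper 'medium' loaded on", model.device)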

View File

@@ -1,5 +1,6 @@
 import logging
 from logging import Logger
+from tempfile import NamedTemporaryFile
 from typing import Optional
 
 from pyrogram import filters
@@ -8,6 +9,7 @@ from pyrogram.types import Message
 from pyrogram.enums import ChatAction
 
 from src.integrations.gigachat_api_client import GigaChatClient
+from src.utils import speech_recognition
 
 
 class TelegramUserBot:
@@ -41,7 +43,8 @@
         Registers the message handlers for the bot.
         """
         self.logger.debug("Registering handlers.")
-        self.app.on_message(filters.command("ai") & filters.text)(self.handle_ai_command)
+        self.app.on_message(filters.command("ai"))(self.handle_ai_command)
+        self.app.on_message(filters.command("voice"))(self.handle_voice_command)
 
     async def handle_ai_command(self, client: Client, message: Message) -> None:
         """
@@ -62,12 +65,12 @@
         if not command_arg:
             self.logger.warning(f"No argument or replied message provided for /ai command by chat_id={message.chat.id}")
-            await message.reply_text("Please provide a message after /ai or reply to a message.")
+            await message.reply("Please provide a message after /ai or reply to a message.", quote=True)
             return
 
         # Send an initial message indicating processing
         self.logger.debug(f"Processing request for chat_id={message.chat.id}")
-        processing_message: Message = await message.reply_text(f"{self.gigachat_client.model_name} is processing your request...")
+        processing_message: Message = await message.reply(f"{self.gigachat_client.model_name} is processing your request...", quote=True)
 
         try:
             # Start typing animation
@@ -92,6 +95,54 @@
             # Handle any errors and notify the user
             await processing_message.edit_text("An error occurred while processing your request.")
 
+    async def handle_voice_command(self, client: Client, message: Message) -> None:
+        """
+        Handle the /voice command to convert a voice message to text.
+
+        Args:
+            client (Client): The Pyrogram Client instance.
+            message (Message): The incoming message containing the /voice command.
+        """
+        self.logger.info(f"Received /voice command from chat_id={message.chat.id}.")
+
+        # Check if the reply is to a voice message
+        if not (message.reply_to_message and message.reply_to_message.voice):
+            self.logger.warning("The /voice command was not used in reply to a voice message.")
+            await message.reply("Please reply to a voice message with the /voice command.", quote=True)
+            return
+
+        # Send an initial message indicating processing
+        processing_message: Message = await message.reply_to_message.reply("Converting voice message to text...", quote=True)
+
+        with NamedTemporaryFile(delete=False) as temp_file:
+            file_path = await client.download_media(message.reply_to_message.voice.file_id, file_name=temp_file.name)
+            self.logger.info(f"Voice message downloaded to {file_path}.")
+
+        try:
+            # Attempt to convert voice to text
+            text: str = speech_recognition.convert_voice_to_text(file_path)  # type: ignore
+            self.logger.info("Voice message successfully converted to text.")
+
+            # Format the text for sending
+            formatted_text: str = (
+                "<b>Conversion Result:</b>"
+                "<pre>"
+                f"{text}"
+                "</pre>"
+            )
+
+            # Edit the initial processing message with the converted text
+            await processing_message.edit_text(formatted_text)
+        except FileNotFoundError:
+            self.logger.error("File not found during processing.", exc_info=True)
+            await processing_message.edit_text("An error occurred while processing the voice message. Please try again later.")
+        except RuntimeError:
+            self.logger.error("A runtime error occurred.", exc_info=True)
+            await processing_message.edit_text("An error occurred while processing the voice message. Please try again later.")
+        except Exception:
+            self.logger.error("An unexpected error occurred.", exc_info=True)
+            await processing_message.edit_text("An error occurred while processing the voice message. Please try again later.")
+
     def run(self) -> None:
         """
         Starts the bot.
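
The handler above downloads the voice note into a NamedTemporaryFile created with delete=False, so the downloaded file is not removed automatically after transcription, and the <b>/<pre> markup in formatted_text relies on Pyrogram's default parse mode accepting HTML tags. For reference, a minimal standalone sketch of the same download-and-transcribe pattern with explicit cleanup (the helper name is illustrative and not part of the commit):

import os
from tempfile import NamedTemporaryFile

from pyrogram import Client
from pyrogram.types import Message

from src.utils import speech_recognition


async def transcribe_replied_voice(client: Client, message: Message) -> str:
    # Illustrative sketch only: download the replied-to voice note, transcribe it,
    # then remove the temporary file (delete=False would otherwise leave it behind).
    voice = message.reply_to_message.voice
    with NamedTemporaryFile(suffix=".ogg", delete=False) as temp_file:
        file_path = await client.download_media(voice.file_id, file_name=temp_file.name)
    try:
        return speech_recognition.convert_voice_to_text(file_path)
    finally:
        os.remove(file_path)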

View File

@@ -0,0 +1,107 @@
+import os
+import logging
+from logging import Logger
+
+from pydub import AudioSegment
+import speech_recognition as sr
+from speech_recognition.audio import AudioData
+
+# Configure logging
+logger: Logger = logging.getLogger(__name__)
+
+
+def convert_to_wav(file_path: str) -> str:
+    """
+    Converts an audio file to WAV format if it is not already in WAV format.
+
+    Args:
+        file_path (str): The path to the audio file to be converted.
+
+    Returns:
+        str: The path to the converted or original WAV file.
+
+    Raises:
+        RuntimeError: If the conversion fails for any reason.
+    """
+    if file_path.lower().endswith('.wav'):
+        logger.info(f"File {file_path} is already in WAV format.")
+        return file_path
+
+    try:
+        logger.info(f"Converting {file_path} to WAV format.")
+        audio = AudioSegment.from_file(file_path)
+        wav_path: str = f"{os.path.splitext(file_path)[0]}.wav"
+        audio.export(wav_path, format="wav")
+        logger.info(f"File converted to {wav_path}.")
+        return wav_path
+    except Exception as e:
+        logger.error(f"Failed to convert file to WAV: {e}")
+        raise RuntimeError(f"Failed to convert file to WAV: {e}")
+
+
+def get_audio_duration(file_path: str) -> float:
+    """
+    Retrieves the duration of an audio file in seconds.
+
+    Args:
+        file_path (str): The path to the audio file.
+
+    Returns:
+        float: The duration of the audio file in seconds.
+
+    Raises:
+        RuntimeError: If unable to get the file duration.
+    """
+    try:
+        logger.info(f"Getting duration of {file_path}.")
+        audio = AudioSegment.from_file(file_path)
+        duration: float = len(audio) / 1000  # Duration in seconds
+        logger.info(f"Duration of {file_path}: {duration} seconds.")
+        return duration
+    except Exception as e:
+        logger.error(f"Failed to get file duration: {e}")
+        raise RuntimeError(f"Failed to get file duration: {e}")
+
+
+def convert_voice_to_text(file_path: str, language='ru') -> str:
+    """
+    Converts speech from an audio file to text using OpenAI's Whisper speech recognition model.
+
+    Args:
+        file_path (str): The path to the audio file to be processed.
+        language (str): The language code for speech recognition (default is 'ru').
+
+    Returns:
+        str: The transcribed text if recognition is successful.
+
+    Raises:
+        FileNotFoundError: If the audio file does not exist.
+        RuntimeError: For any other errors encountered during processing.
+    """
+    # Check if the file exists
+    if not os.path.exists(file_path):
+        logger.error(f"File {file_path} does not exist.")
+        raise FileNotFoundError("File does not exist.")
+
+    # Convert the file to WAV format if necessary
+    try:
+        wav_path: str = convert_to_wav(file_path)
+    except RuntimeError as e:
+        logger.error(f"Error converting to WAV: {e}")
+        raise RuntimeError(f"Error converting to WAV: {e}")
+
+    recognizer = sr.Recognizer()
+    try:
+        logger.info(f"Processing file {wav_path} ({get_audio_duration(wav_path)} sec) for speech recognition.")
+        with sr.AudioFile(wav_path) as source:
+            audio_data: AudioData = recognizer.record(source)
+            text = recognizer.recognize_whisper(audio_data, language=language, model='medium')
+            logger.info("Speech recognition successful.")
+            return text  # type: ignore
+    except sr.UnknownValueError:
+        logger.warning(f"Speech in {wav_path} could not be recognized.")
+        raise RuntimeError("Speech could not be recognized.")
+    except sr.RequestError as e:
+        logger.error(f"Request error from the recognition service: {e}")
+        raise RuntimeError(f"Request error from the recognition service: {e}")
+    except Exception as e:
+        logger.error(f"An unexpected error occurred: {e}")
+        raise RuntimeError(f"An unexpected error occurred: {e}")