feat: add voice message recognition

This commit is contained in:
parent 0b9603d9f2
commit b9d6cde8fe
5 changed files with 1152 additions and 16 deletions

View File

@ -1,6 +1,6 @@
from src.integrations.gigachat_api_client import GigaChatClient
from src.bot.telegram_userbot import TelegramUserBot
from src.utils.logging import setup_logging
from src.utils import logging
from src.core.configuration import config
@ -9,7 +9,7 @@ def main() -> None:
Entry point for starting the Telegram user bot.
"""
# Configure logging
setup_logging()
logging.setup_logging()
# Load API credentials and configuration
api_id: str = config.API_ID

989
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -6,7 +6,7 @@ authors = ["Factorino73 <masenkin73@xmail.ru>"]
readme = "README.md"
[tool.poetry.dependencies]
python = "^3.13"
python = "^3.12"
pyrogram = "^2.0.106"
tgcrypto = "^1.2.5"
setuptools = "^75.6.0"
@ -14,6 +14,15 @@ wheel = "^0.45.1"
langchain-gigachat = "^0.3.2"
punq = "^0.7.0"
pytest = "^8.3.4"
speechrecognition = "^3.13.0"
typing-extensions = "^4.12.2"
pydub = "^0.25.1"
numpy = "2.0.2"
soundfile = "^0.13.0"
torch = "^2.5.1"
llvmlite = "0.43.0"
numba = "0.60.0"
openai-whisper = "^20240930"
[build-system]

View File

@ -1,5 +1,6 @@
import logging
from logging import Logger
from tempfile import NamedTemporaryFile
from typing import Optional
from pyrogram import filters
@ -8,6 +9,7 @@ from pyrogram.types import Message
from pyrogram.enums import ChatAction
from src.integrations.gigachat_api_client import GigaChatClient
from src.utils import speech_recognition
class TelegramUserBot:
@ -41,7 +43,8 @@ class TelegramUserBot:
Registers the message handlers for the bot.
"""
self.logger.debug("Registering handlers.")
self.app.on_message(filters.command("ai") & filters.text)(self.handle_ai_command)
self.app.on_message(filters.command("ai"))(self.handle_ai_command)
self.app.on_message(filters.command("voice"))(self.handle_voice_command)
async def handle_ai_command(self, client: Client, message: Message) -> None:
"""
@ -62,12 +65,12 @@ class TelegramUserBot:
if not command_arg:
self.logger.warning(f"No argument or replied message provided for /ai command by chat_id={message.chat.id}")
await message.reply_text("Please provide a message after /ai or reply to a message.")
await message.reply("Please provide a message after /ai or reply to a message.", quote=True)
return
# Send an initial message indicating processing
self.logger.debug(f"Processing request for chat_id={message.chat.id}")
processing_message: Message = await message.reply_text(f"{self.gigachat_client.model_name} is processing your request...")
processing_message: Message = await message.reply(f"{self.gigachat_client.model_name} is processing your request...", quote=True)
try:
# Start typing animation
@ -92,6 +95,54 @@ class TelegramUserBot:
# Handle any errors and notify the user
await processing_message.edit_text("An error occurred while processing your request.")
async def handle_voice_command(self, client: Client, message: Message) -> None:
    """
    Handle the /voice command to convert a voice message to text.

    The command must be sent as a reply to a voice message. The voice file is
    downloaded to a temporary file, transcribed, and the result is written
    into a status message posted as a reply to the voice message. The
    temporary download is always removed afterwards.

    Args:
        client (Client): The Pyrogram Client instance.
        message (Message): The incoming message containing the /voice command.
    """
    # Function-scope imports: these stdlib modules are only needed by this handler.
    import asyncio
    import html
    import os

    self.logger.info(f"Received /voice command from chat_id={message.chat.id}.")

    # Check if the reply is to a voice message
    replied_message = message.reply_to_message
    if not (replied_message and replied_message.voice):
        self.logger.warning("The /voice command was not used in reply to a voice message.")
        await message.reply("Please reply to a voice message with the /voice command.", quote=True)
        return

    # Send an initial message indicating processing
    processing_message: Message = await replied_message.reply("Converting voice message to text...", quote=True)

    error_text: str = "An error occurred while processing the voice message. Please try again later."

    # Reserve a unique path and close the handle immediately, so the download
    # never has to write over a file this process still holds open.
    with NamedTemporaryFile(delete=False) as temp_file:
        temp_path: str = temp_file.name
    downloaded_path: Optional[str] = None

    try:
        # Download inside the try block so a failed download is reported to
        # the user instead of leaving the status message stuck.
        downloaded_path = await client.download_media(replied_message.voice.file_id, file_name=temp_path)  # type: ignore
        if not downloaded_path:
            raise FileNotFoundError("Voice message could not be downloaded.")
        self.logger.info(f"Voice message downloaded to {downloaded_path}.")

        # The transcription call is synchronous; run it in a worker thread so
        # the event loop keeps serving other updates meanwhile.
        text: str = await asyncio.to_thread(speech_recognition.convert_voice_to_text, downloaded_path)
        self.logger.info("Voice message successfully converted to text.")

        # Format the text for sending; escape it because it is embedded in HTML markup
        formatted_text: str = (
            "<b>Conversion Result:</b>"
            "<pre>"
            f"{html.escape(text)}"
            "</pre>"
        )

        # Edit the initial processing message with the converted text
        await processing_message.edit_text(formatted_text)
    except FileNotFoundError:
        self.logger.error("File not found during processing.", exc_info=True)
        await processing_message.edit_text(error_text)
    except RuntimeError:
        self.logger.error("A runtime error occurred.", exc_info=True)
        await processing_message.edit_text(error_text)
    except Exception:
        self.logger.error("An unexpected error occurred.", exc_info=True)
        await processing_message.edit_text(error_text)
    finally:
        # The temp file was created with delete=False, so nothing else removes it.
        for leftover in {temp_path, downloaded_path}:
            if not leftover:
                continue
            try:
                os.remove(leftover)
            except FileNotFoundError:
                # Already gone (e.g. the download never materialised) - nothing to do.
                pass
            except OSError:
                self.logger.warning(f"Could not remove temporary file {leftover}.", exc_info=True)
def run(self) -> None:
"""
Starts the bot.

View File

@ -0,0 +1,107 @@
import os
import logging
from logging import Logger
from pydub import AudioSegment
import speech_recognition as sr
from speech_recognition.audio import AudioData
# Module-level logger named after this module
logger: Logger = logging.getLogger(__name__)
def convert_to_wav(file_path: str) -> str:
    """
    Converts an audio file to WAV format if it is not already in WAV format.

    Whether the file is "already WAV" is decided from the file extension only
    (case-insensitive); the file contents are not inspected. The converted
    file is written to the same path with the extension replaced by ``.wav``.

    Args:
        file_path (str): The path to the audio file to be converted.

    Returns:
        str: The path to the converted or original WAV file.

    Raises:
        RuntimeError: If the conversion fails for any reason. The original
            exception is attached as the cause.
    """
    if file_path.lower().endswith('.wav'):
        logger.info(f"File {file_path} is already in WAV format.")
        return file_path

    try:
        logger.info(f"Converting {file_path} to WAV format.")
        audio = AudioSegment.from_file(file_path)
        wav_path: str = f"{os.path.splitext(file_path)[0]}.wav"
        # export() returns the output file object still open; close it so the
        # handle is released deterministically instead of at garbage collection.
        audio.export(wav_path, format="wav").close()
        logger.info(f"File converted to {wav_path}.")
        return wav_path
    except Exception as e:
        logger.error(f"Failed to convert file to WAV: {e}")
        # Chain the cause so the original traceback is preserved for debugging.
        raise RuntimeError(f"Failed to convert file to WAV: {e}") from e
def get_audio_duration(file_path: str) -> float:
    """
    Retrieves the duration of an audio file in seconds.

    The file is loaded through ``AudioSegment.from_file`` and the segment
    length (treated as milliseconds) is divided by 1000.

    Args:
        file_path (str): The path to the audio file.

    Returns:
        float: The duration of the audio file in seconds.

    Raises:
        RuntimeError: If unable to get the file duration. The original
            exception is attached as the cause.
    """
    try:
        logger.info(f"Getting duration of {file_path}.")
        audio = AudioSegment.from_file(file_path)
        duration: float = len(audio) / 1000  # Duration in seconds
        logger.info(f"Duration of {file_path}: {duration} seconds.")
        return duration
    except Exception as e:
        logger.error(f"Failed to get file duration: {e}")
        # Chain the cause so the original traceback is preserved for debugging.
        raise RuntimeError(f"Failed to get file duration: {e}") from e
def convert_voice_to_text(file_path: str, language: str = 'ru') -> str:
    """
    Converts speech from an audio file to text using Whisper speech recognition.

    The input is converted to WAV when its extension is not ``.wav``, read in
    full, and passed to the recognizer's Whisper backend with the ``medium``
    model. A WAV file created only for this call is removed before returning;
    the input file itself is never deleted.

    Args:
        file_path (str): The path to the audio file to be processed.
        language (str): The language code for speech recognition (default is 'ru').

    Returns:
        str: The transcribed text if recognition is successful.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        RuntimeError: For any other error encountered during conversion or
            recognition. The original exception is attached as the cause.
    """
    # Check if the file exists
    if not os.path.exists(file_path):
        logger.error(f"File {file_path} does not exist.")
        raise FileNotFoundError("File does not exist.")

    # Convert the file to WAV format if necessary
    try:
        wav_path: str = convert_to_wav(file_path)
    except RuntimeError as e:
        logger.error(f"Error converting to WAV: {e}")
        raise RuntimeError(f"Error converting to WAV: {e}") from e

    recognizer = sr.Recognizer()

    try:
        logger.info(f"Processing file {wav_path} ({get_audio_duration(wav_path)} sec) for speech recognition.")
        with sr.AudioFile(wav_path) as source:
            audio_data: AudioData = recognizer.record(source)
        text = recognizer.recognize_whisper(audio_data, language=language, model='medium')
        logger.info("Speech recognition successful.")
        return text  # type: ignore
    except sr.UnknownValueError as e:
        logger.warning(f"Speech in {wav_path} could not be recognized.")
        raise RuntimeError("Speech could not be recognized.") from e
    except sr.RequestError as e:
        logger.error(f"Request error from the recognition service: {e}")
        raise RuntimeError(f"Request error from the recognition service: {e}") from e
    except Exception as e:
        logger.error(f"An unexpected error occurred: {e}")
        raise RuntimeError(f"An unexpected error occurred: {e}") from e
    finally:
        # A differing path means the WAV was produced by the conversion step
        # above; it is an intermediate artifact, so do not leave it behind.
        if wav_path != file_path:
            try:
                os.remove(wav_path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                logger.warning(f"Could not remove intermediate file {wav_path}: {cleanup_error}")