feat: add voice messages recognition
This commit is contained in:
parent
0b9603d9f2
commit
b9d6cde8fe
4
main.py
4
main.py
@ -1,6 +1,6 @@
|
|||||||
from src.integrations.gigachat_api_client import GigaChatClient
|
from src.integrations.gigachat_api_client import GigaChatClient
|
||||||
from src.bot.telegram_userbot import TelegramUserBot
|
from src.bot.telegram_userbot import TelegramUserBot
|
||||||
from src.utils.logging import setup_logging
|
from src.utils import logging
|
||||||
from src.core.configuration import config
|
from src.core.configuration import config
|
||||||
|
|
||||||
|
|
||||||
@ -9,7 +9,7 @@ def main() -> None:
|
|||||||
Entry point for starting the Telegram user bot.
|
Entry point for starting the Telegram user bot.
|
||||||
"""
|
"""
|
||||||
# Configure logging
|
# Configure logging
|
||||||
setup_logging()
|
logging.setup_logging()
|
||||||
|
|
||||||
# Load API credentials and configuration
|
# Load API credentials and configuration
|
||||||
api_id: str = config.API_ID
|
api_id: str = config.API_ID
|
||||||
|
989
poetry.lock
generated
989
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -6,7 +6,7 @@ authors = ["Factorino73 <masenkin73@xmail.ru>"]
|
|||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
|
|
||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = "^3.13"
|
python = "^3.12"
|
||||||
pyrogram = "^2.0.106"
|
pyrogram = "^2.0.106"
|
||||||
tgcrypto = "^1.2.5"
|
tgcrypto = "^1.2.5"
|
||||||
setuptools = "^75.6.0"
|
setuptools = "^75.6.0"
|
||||||
@ -14,6 +14,15 @@ wheel = "^0.45.1"
|
|||||||
langchain-gigachat = "^0.3.2"
|
langchain-gigachat = "^0.3.2"
|
||||||
punq = "^0.7.0"
|
punq = "^0.7.0"
|
||||||
pytest = "^8.3.4"
|
pytest = "^8.3.4"
|
||||||
|
speechrecognition = "^3.13.0"
|
||||||
|
typing-extensions = "^4.12.2"
|
||||||
|
pydub = "^0.25.1"
|
||||||
|
numpy = "2.0.2"
|
||||||
|
soundfile = "^0.13.0"
|
||||||
|
torch = "^2.5.1"
|
||||||
|
llvmlite = "0.43.0"
|
||||||
|
numba = "0.60.0"
|
||||||
|
openai-whisper = "^20240930"
|
||||||
|
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import logging
|
import logging
|
||||||
from logging import Logger
|
from logging import Logger
|
||||||
|
from tempfile import NamedTemporaryFile
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from pyrogram import filters
|
from pyrogram import filters
|
||||||
@ -8,6 +9,7 @@ from pyrogram.types import Message
|
|||||||
from pyrogram.enums import ChatAction
|
from pyrogram.enums import ChatAction
|
||||||
|
|
||||||
from src.integrations.gigachat_api_client import GigaChatClient
|
from src.integrations.gigachat_api_client import GigaChatClient
|
||||||
|
from src.utils import speech_recognition
|
||||||
|
|
||||||
|
|
||||||
class TelegramUserBot:
|
class TelegramUserBot:
|
||||||
@ -41,7 +43,8 @@ class TelegramUserBot:
|
|||||||
Registers the message handlers for the bot.
|
Registers the message handlers for the bot.
|
||||||
"""
|
"""
|
||||||
self.logger.debug("Registering handlers.")
|
self.logger.debug("Registering handlers.")
|
||||||
self.app.on_message(filters.command("ai") & filters.text)(self.handle_ai_command)
|
self.app.on_message(filters.command("ai"))(self.handle_ai_command)
|
||||||
|
self.app.on_message(filters.command("voice"))(self.handle_voice_command)
|
||||||
|
|
||||||
async def handle_ai_command(self, client: Client, message: Message) -> None:
|
async def handle_ai_command(self, client: Client, message: Message) -> None:
|
||||||
"""
|
"""
|
||||||
@ -62,12 +65,12 @@ class TelegramUserBot:
|
|||||||
|
|
||||||
if not command_arg:
|
if not command_arg:
|
||||||
self.logger.warning(f"No argument or replied message provided for /ai command by chat_id={message.chat.id}")
|
self.logger.warning(f"No argument or replied message provided for /ai command by chat_id={message.chat.id}")
|
||||||
await message.reply_text("Please provide a message after /ai or reply to a message.")
|
await message.reply("Please provide a message after /ai or reply to a message.", quote=True)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Send an initial message indicating processing
|
# Send an initial message indicating processing
|
||||||
self.logger.debug(f"Processing request for chat_id={message.chat.id}")
|
self.logger.debug(f"Processing request for chat_id={message.chat.id}")
|
||||||
processing_message: Message = await message.reply_text(f"{self.gigachat_client.model_name} is processing your request...")
|
processing_message: Message = await message.reply(f"{self.gigachat_client.model_name} is processing your request...", quote=True)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Start typing animation
|
# Start typing animation
|
||||||
@ -92,6 +95,54 @@ class TelegramUserBot:
|
|||||||
# Handle any errors and notify the user
|
# Handle any errors and notify the user
|
||||||
await processing_message.edit_text("An error occurred while processing your request.")
|
await processing_message.edit_text("An error occurred while processing your request.")
|
||||||
|
|
||||||
|
async def handle_voice_command(self, client: Client, message: Message) -> None:
    """
    Handle the /voice command to convert a voice message to text.

    The command must be sent as a reply to a voice message. The voice
    message is downloaded to a temporary file, transcribed, and the
    transcript replaces the "Converting..." placeholder message. Any failure
    is logged and reported to the user through the same placeholder message.
    The downloaded file is always removed afterwards.

    Args:
        client (Client): The Pyrogram Client instance.
        message (Message): The incoming message containing the /voice command.
    """
    # Imported locally so the handler does not rely on module-level imports
    # that may not be present in this file.
    import asyncio
    import html
    import os

    self.logger.info(f"Received /voice command from chat_id={message.chat.id}.")

    # Check if the reply is to a voice message
    voice_message = message.reply_to_message
    if not (voice_message and voice_message.voice):
        self.logger.warning("The /voice command was not used in reply to a voice message.")
        await message.reply("Please reply to a voice message with the /voice command.", quote=True)
        return

    # Send an initial message indicating processing
    processing_message: Message = await voice_message.reply("Converting voice message to text...", quote=True)
    error_text: str = "An error occurred while processing the voice message. Please try again later."

    # Reserve a unique path and close the handle immediately, so the download
    # is not written to a file this process still holds open.
    with NamedTemporaryFile(delete=False) as temp_file:
        temp_path: str = temp_file.name

    file_path: Optional[str] = None
    try:
        # The download is inside the try block so a failure is reported to the
        # user instead of leaving the placeholder message stuck.
        file_path = await client.download_media(voice_message.voice.file_id, file_name=temp_path)
        if not file_path:
            raise RuntimeError("Voice message download failed.")
        self.logger.info(f"Voice message downloaded to {file_path}.")

        # Recognition is blocking and potentially slow; run it in a worker
        # thread so the event loop keeps serving other updates.
        text: str = await asyncio.to_thread(speech_recognition.convert_voice_to_text, file_path)  # type: ignore
        self.logger.info("Voice message successfully converted to text.")

        # Format the text for sending; escape it so characters such as
        # '<' or '&' in the transcript cannot break the HTML markup.
        formatted_text: str = (
            "<b>Conversion Result:</b>"
            "<pre>"
            f"{html.escape(text, quote=False)}"
            "</pre>"
        )

        # Edit the initial processing message with the converted text
        await processing_message.edit_text(formatted_text)
    except FileNotFoundError:
        self.logger.error("File not found during processing.", exc_info=True)
        await processing_message.edit_text(error_text)
    except RuntimeError:
        self.logger.error("A runtime error occurred.", exc_info=True)
        await processing_message.edit_text(error_text)
    except Exception:
        self.logger.error("An unexpected error occurred.", exc_info=True)
        await processing_message.edit_text(error_text)
    finally:
        # The temporary file was created with delete=False, so it has to be
        # removed explicitly or every /voice call leaves a file behind.
        for leftover in filter(None, {temp_path, file_path}):
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass
            except OSError:
                self.logger.warning(f"Could not remove temporary file {leftover}.", exc_info=True)
|
||||||
|
|
||||||
def run(self) -> None:
|
def run(self) -> None:
|
||||||
"""
|
"""
|
||||||
Starts the bot.
|
Starts the bot.
|
||||||
|
107
src/utils/speech_recognition.py
Normal file
107
src/utils/speech_recognition.py
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
import os
|
||||||
|
import logging
|
||||||
|
from logging import Logger
|
||||||
|
|
||||||
|
from pydub import AudioSegment
|
||||||
|
import speech_recognition as sr
|
||||||
|
from speech_recognition.audio import AudioData
|
||||||
|
|
||||||
|
|
||||||
|
# Configure logging
|
||||||
|
logger: Logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def convert_to_wav(file_path: str) -> str:
    """
    Converts an audio file to WAV format if it is not already in WAV format.

    Whether the file is already WAV is decided only by its ".wav" extension
    (case-insensitive). Otherwise the audio is decoded with pydub and written
    next to the source file, using the same base name and a ".wav" extension.

    Args:
        file_path (str): The path to the audio file to be converted.

    Returns:
        str: The path to the converted or original WAV file.

    Raises:
        RuntimeError: If the conversion fails for any reason. The original
            exception is attached as the cause.
    """
    if file_path.lower().endswith('.wav'):
        logger.info(f"File {file_path} is already in WAV format.")
        return file_path

    try:
        logger.info(f"Converting {file_path} to WAV format.")
        audio = AudioSegment.from_file(file_path)
        wav_path: str = f"{os.path.splitext(file_path)[0]}.wav"
        # export() returns the output file object; close it so the handle is
        # released right away rather than whenever it is garbage collected.
        audio.export(wav_path, format="wav").close()
        logger.info(f"File converted to {wav_path}.")
        return wav_path
    except Exception as e:
        logger.error(f"Failed to convert file to WAV: {e}")
        # Chain the cause so the underlying failure stays in the traceback.
        raise RuntimeError(f"Failed to convert file to WAV: {e}") from e
|
||||||
|
|
||||||
|
def get_audio_duration(file_path: str) -> float:
    """
    Retrieves the duration of an audio file in seconds.

    The file is decoded with pydub and the length of the resulting segment
    is divided by 1000 to obtain seconds.

    Args:
        file_path (str): The path to the audio file.

    Returns:
        float: The duration of the audio file in seconds.

    Raises:
        RuntimeError: If unable to get the file duration. The original
            exception is attached as the cause.
    """
    try:
        logger.info(f"Getting duration of {file_path}.")
        audio = AudioSegment.from_file(file_path)
        duration: float = len(audio) / 1000  # Duration in seconds
        logger.info(f"Duration of {file_path}: {duration} seconds.")
        return duration
    except Exception as e:
        logger.error(f"Failed to get file duration: {e}")
        # Chain the cause so the underlying failure stays in the traceback.
        raise RuntimeError(f"Failed to get file duration: {e}") from e
|
||||||
|
|
||||||
|
def convert_voice_to_text(file_path: str, language: str = 'ru') -> str:
    """
    Converts speech from an audio file to text using Whisper.

    The file is converted to WAV when needed, loaded through the
    SpeechRecognition library and transcribed with its `recognize_whisper`
    method using the 'medium' model. A WAV file created only for this call is
    removed before returning; the input file itself is never deleted.

    Args:
        file_path (str): The path to the audio file to be processed.
        language (str): The language code for speech recognition (default is 'ru').

    Returns:
        str: The transcribed text if recognition is successful.

    Raises:
        FileNotFoundError: If `file_path` does not exist.
        RuntimeError: For any other error encountered during processing. The
            original exception is attached as the cause.
    """
    # Check if the file exists
    if not os.path.exists(file_path):
        logger.error(f"File {file_path} does not exist.")
        raise FileNotFoundError("File does not exist.")

    # Convert the file to WAV format if necessary
    try:
        wav_path: str = convert_to_wav(file_path)
    except RuntimeError as e:
        logger.error(f"Error converting to WAV: {e}")
        raise RuntimeError(f"Error converting to WAV: {e}") from e

    recognizer = sr.Recognizer()

    try:
        logger.info(f"Processing file {wav_path} ({get_audio_duration(wav_path)} sec) for speech recognition.")
        with sr.AudioFile(wav_path) as source:
            audio_data: AudioData = recognizer.record(source)
        text = recognizer.recognize_whisper(audio_data, language=language, model='medium')
        logger.info("Speech recognition successful.")
        return text  # type: ignore
    except sr.UnknownValueError as e:
        logger.warning(f"Speech in {wav_path} could not be recognized.")
        raise RuntimeError("Speech could not be recognized.") from e
    except sr.RequestError as e:
        logger.error(f"Request error from the recognition service: {e}")
        raise RuntimeError(f"Request error from the recognition service: {e}") from e
    except Exception as e:
        logger.error(f"An unexpected error occurred: {e}")
        raise RuntimeError(f"An unexpected error occurred: {e}") from e
    finally:
        # Remove the intermediate WAV only when this call created it; when the
        # input was already a WAV file, wav_path is the caller's own file.
        if wav_path != file_path:
            try:
                os.remove(wav_path)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove temporary file {wav_path}: {cleanup_error}")
|
Loading…
Reference in New Issue
Block a user