From d38fc6104398810fe50282b56bebca043bdca34f Mon Sep 17 00:00:00 2001 From: wanggang Date: Tue, 7 Mar 2023 14:29:59 +0800 Subject: [PATCH 1/8] [voice] add google voice support --- .gitignore | 1 + README.md | 11 +++++++++- bridge/bridge.py | 4 ++++ channel/channel.py | 5 ++++- channel/wechat/wechat_channel.py | 34 ++++++++++++++++++++++++++----- config-template.json | 1 + voice/google/google_voice.py | 21 +++++++++++++++++++ voice/voice.py | 10 +++++++++ voice/voice_factory.py | 17 ++++++++++++++++ voice/xfyun/xfyun_voice.py | 35 ++++++++++++++++++++++++++++++++ 10 files changed, 132 insertions(+), 7 deletions(-) create mode 100644 voice/google/google_voice.py create mode 100644 voice/voice.py create mode 100644 voice/voice_factory.py create mode 100644 voice/xfyun/xfyun_voice.py diff --git a/.gitignore b/.gitignore index c4d7bdc47..8bc62f302 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ venv* config.json QR.png nohup.out +tmp diff --git a/README.md b/README.md index a75037821..4ff7ceaec 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,12 @@ cd chatgpt-on-wechat/ ```bash pip3 install itchat-uos==1.5.0.dev0 pip3 install --upgrade openai + +如果使用google的语音识别,需要安装speech_recognition和依赖的ffmpeg +pip3 install speech_recognition +--在MacOS中安装ffmpeg,brew install ffmpeg +--在Windows中安装ffmpeg,下载ffmpeg.exe +--在Linux中安装ffmpeg,apt-get install ffmpeg ``` 注:`itchat-uos`使用指定版本1.5.0.dev0,`openai`使用最新版本,需高于0.27.0。 @@ -112,7 +118,10 @@ cp config-template.json config.json + 默认只要被人 @ 就会触发机器人自动回复;另外群聊天中只要检测到以 "@bot" 开头的内容,同样会自动回复(方便自己触发),这对应配置项 `group_chat_prefix` + 可选配置: `group_name_keyword_white_list`配置项支持模糊匹配群名称,`group_chat_keyword`配置项则支持模糊匹配群消息内容,用法与上述两个配置项相同。(Contributed by [evolay](https://github.com/evolay)) -**3.其他配置** +**3.语音识别** ++ 配置`speech_recognition=true`开启语音识别 + +**4.其他配置** + `proxy`:由于目前 `openai` 接口国内无法访问,需配置代理客户端的地址,详情参考 [#351](https://github.com/zhayujie/chatgpt-on-wechat/issues/351) + 对于图像生成,在满足个人或群组触发条件外,还需要额外的关键词前缀来触发,对应配置 `image_create_prefix ` diff --git a/bridge/bridge.py b/bridge/bridge.py index 6c164e87b..78d950ab9 100644 --- a/bridge/bridge.py +++ b/bridge/bridge.py @@ -1,4 +1,5 @@ from bot import bot_factory +from voice import voice_factory class Bridge(object): @@ -7,3 +8,6 @@ def __init__(self): def fetch_reply_content(self, query, context): return bot_factory.create_bot("chatGPT").reply(query, context) + + def fetch_voice_to_text(self, voiceFile): + return voice_factory.create_voice("google").voiceToText(voiceFile) diff --git a/channel/channel.py b/channel/channel.py index e2617d1f3..d4c0fc5a7 100644 --- a/channel/channel.py +++ b/channel/channel.py @@ -11,7 +11,7 @@ def startup(self): """ raise NotImplementedError - def handle(self, msg): + def handle_text(self, msg): """ process received msg :param msg: message object @@ -29,3 +29,6 @@ def send(self, msg, receiver): def build_reply_content(self, query, context=None): return Bridge().fetch_reply_content(query, context) + + def build_void_text(self, voice_file): + return Bridge().fetch_voice_to_text(voice_file) diff --git a/channel/wechat/wechat_channel.py b/channel/wechat/wechat_channel.py index 66778f415..b7730108e 100644 --- a/channel/wechat/wechat_channel.py +++ b/channel/wechat/wechat_channel.py @@ -3,6 +3,8 @@ """ wechat channel """ + +import os import itchat import json from itchat.content import * @@ -18,7 +20,7 @@ @itchat.msg_register(TEXT) def handler_single_msg(msg): - WechatChannel().handle(msg) + WechatChannel().handle_text(msg) return None @@ -28,9 +30,19 @@ def handler_group_msg(msg): return None +@itchat.msg_register(VOICE) +def handler_single_voice(msg): + WechatChannel().handle_voice(msg) + return None + + class WechatChannel(Channel): + tmpFilePath = './tmp/' + def __init__(self): - pass + isExists = os.path.exists(self.tmpFilePath) + if not isExists: + os.makedirs(self.tmpFilePath) def startup(self): # login by scan QRCode @@ -39,12 +51,24 @@ def startup(self): # start message listener itchat.run() - def handle(self, msg): - logger.debug("[WX]receive msg: " + json.dumps(msg, ensure_ascii=False)) + def handle_voice(self, msg): + if conf().get('speech_recognition') != True : + return + logger.debug("[WX]receive voice msg: ", msg['FileName']) + fileName = msg['FileName'] + msg.download(self.tmpFilePath+fileName) + content = super().build_void_text(self.tmpFilePath+fileName) + self._handle_single_msg(msg, content) + + def handle_text(self, msg): + logger.debug("[WX]receive text msg: " + json.dumps(msg, ensure_ascii=False)) + content = msg['Text'] + self._handle_single_msg(msg, content) + + def _handle_single_msg(self, msg, content): from_user_id = msg['FromUserName'] to_user_id = msg['ToUserName'] # 接收人id other_user_id = msg['User']['UserName'] # 对手方id - content = msg['Text'] match_prefix = self.check_prefix(content, conf().get('single_chat_prefix')) if "」\n- - - - - - - - - - - - - - -" in content: logger.debug("[WX]reference query skipped") diff --git a/config-template.json b/config-template.json index fd6d46a4e..9ad9f5d6f 100644 --- a/config-template.json +++ b/config-template.json @@ -7,6 +7,7 @@ "group_name_white_list": ["ChatGPT测试群", "ChatGPT测试群2"], "image_create_prefix": ["画", "看", "找"], "conversation_max_tokens": 1000, + "speech_recognition": false, "character_desc": "你是ChatGPT, 一个由OpenAI训练的大型语言模型, 你旨在回答并解决人们的任何问题,并且可以使用多种语言与人交流。", "expires_in_seconds": 3600 } diff --git a/voice/google/google_voice.py b/voice/google/google_voice.py new file mode 100644 index 000000000..7af388054 --- /dev/null +++ b/voice/google/google_voice.py @@ -0,0 +1,21 @@ + +""" +google voice service +""" + +import subprocess +import speech_recognition +from voice.voice import Voice + +class GoogleVoice(Voice): + recognizer = speech_recognition.Recognizer() + + def __init__(self): + pass + + def voiceToText(self, voice_file): + new_file = voice_file.replace('.mp3', '.wav') + subprocess.call('ffmpeg -i ' + voice_file + ' -acodec pcm_s16le -ac 1 -ar 16000 ' + new_file, shell=True) + with speech_recognition.AudioFile(new_file) as source: + audio = self.recognizer.record(source) + return self.recognizer.recognize_google(audio, language='zh-CN') diff --git a/voice/voice.py b/voice/voice.py new file mode 100644 index 000000000..2f66dab20 --- /dev/null +++ b/voice/voice.py @@ -0,0 +1,10 @@ +""" +Voice service abstract class +""" + +class Voice(object): + def voiceToText(self, voice_file): + """ + Send voice to voice service and get text + """ + raise NotImplementedError diff --git a/voice/voice_factory.py b/voice/voice_factory.py new file mode 100644 index 000000000..5457d147d --- /dev/null +++ b/voice/voice_factory.py @@ -0,0 +1,17 @@ +""" +voice factory +""" + +def create_voice(voice_type): + """ + create a voice instance + :param voice_type: voice type code + :return: voice instance + """ + if voice_type == 'xfyun': + from voice.xfyun.xfyun_voice import XfyunVoice + return XfyunVoice() + elif voice_type == 'google': + from voice.google.google_voice import GoogleVoice + return GoogleVoice() + raise RuntimeError diff --git a/voice/xfyun/xfyun_voice.py b/voice/xfyun/xfyun_voice.py new file mode 100644 index 000000000..74b27b202 --- /dev/null +++ b/voice/xfyun/xfyun_voice.py @@ -0,0 +1,35 @@ + +""" +科大讯飞 voice service +""" + +from voice.voice import Voice + +# 科大讯飞语音识别 +lfasr_host = 'http://raasr.xfyun.cn/api' +# 请求的接口名 +api_prepare = '/prepare' +api_upload = '/upload' +api_merge = '/merge' +api_get_progress = '/getProgress' +api_get_result = '/getResult' +# 文件分片大小10M +file_piece_sice = 10485760 +# ——————————————————转写可配置参数———————————————— +# 参数可在官网界面(https://doc.xfyun.cn/rest_api/%E8%AF%AD%E9%9F%B3%E8%BD%AC%E5%86%99.html)查看,根据需求可自行在gene_params方法里添加修改 +# 转写类型 +lfasr_type = 0 +# 是否开启分词 +has_participle = 'false' +has_seperate = 'true' +# 多候选词个数 +max_alternatives = 0 +# 子用户标识 +suid = '' + +class XfyunVoice(Voice): + def __init__(self): + pass + + def voiceToText(self, voice_file): + pass \ No newline at end of file From 1711a5c0640163aa11b6bbd2a4571a71cc508900 Mon Sep 17 00:00:00 2001 From: wanggang Date: Tue, 7 Mar 2023 14:42:06 +0800 Subject: [PATCH 2/8] [voice] fix google voice exception issue --- README.md | 2 +- voice/google/google_voice.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4ff7ceaec..54fbf831e 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ pip3 install itchat-uos==1.5.0.dev0 pip3 install --upgrade openai 如果使用google的语音识别,需要安装speech_recognition和依赖的ffmpeg -pip3 install speech_recognition +pip3 install SpeechRecognition --在MacOS中安装ffmpeg,brew install ffmpeg --在Windows中安装ffmpeg,下载ffmpeg.exe --在Linux中安装ffmpeg,apt-get install ffmpeg diff --git a/voice/google/google_voice.py b/voice/google/google_voice.py index 7af388054..97597b39e 100644 --- a/voice/google/google_voice.py +++ b/voice/google/google_voice.py @@ -18,4 +18,9 @@ def voiceToText(self, voice_file): subprocess.call('ffmpeg -i ' + voice_file + ' -acodec pcm_s16le -ac 1 -ar 16000 ' + new_file, shell=True) with speech_recognition.AudioFile(new_file) as source: audio = self.recognizer.record(source) - return self.recognizer.recognize_google(audio, language='zh-CN') + try: + return self.recognizer.recognize_google(audio, language='zh-CN') + except speech_recognition.UnknownValueError: + return "抱歉,我听不懂。" + except speech_recognition.RequestError as e: + return "抱歉,无法连接到 Google 语音识别服务;{0}".format(e) From cc19017c01c7e8b7be7eb6363d64e7865826f6b8 Mon Sep 17 00:00:00 2001 From: wanggang Date: Tue, 7 Mar 2023 23:28:57 +0800 Subject: [PATCH 3/8] [voice] add text to voice --- bridge/bridge.py | 3 ++ channel/channel.py | 5 +++- channel/wechat/wechat_channel.py | 47 +++++++++++++++++++++++--------- voice/google/google_voice.py | 32 +++++++++++++++++++--- voice/voice.py | 6 ++++ 5 files changed, 75 insertions(+), 18 deletions(-) diff --git a/bridge/bridge.py b/bridge/bridge.py index 78d950ab9..9d00bfedc 100644 --- a/bridge/bridge.py +++ b/bridge/bridge.py @@ -11,3 +11,6 @@ def fetch_reply_content(self, query, context): def fetch_voice_to_text(self, voiceFile): return voice_factory.create_voice("google").voiceToText(voiceFile) + + def fetch_text_to_voice(self, text): + return voice_factory.create_voice("google").textToVoice(text) \ No newline at end of file diff --git a/channel/channel.py b/channel/channel.py index d4c0fc5a7..a1395c49a 100644 --- a/channel/channel.py +++ b/channel/channel.py @@ -30,5 +30,8 @@ def send(self, msg, receiver): def build_reply_content(self, query, context=None): return Bridge().fetch_reply_content(query, context) - def build_void_text(self, voice_file): + def build_voice_to_text(self, voice_file): return Bridge().fetch_voice_to_text(voice_file) + + def build_text_to_voice(self, text): + return Bridge().fetch_text_to_voice(text) diff --git a/channel/wechat/wechat_channel.py b/channel/wechat/wechat_channel.py index b7730108e..b3d36582d 100644 --- a/channel/wechat/wechat_channel.py +++ b/channel/wechat/wechat_channel.py @@ -40,6 +40,7 @@ class WechatChannel(Channel): tmpFilePath = './tmp/' def __init__(self): + voices = self.engine.getProperty('voices') isExists = os.path.exists(self.tmpFilePath) if not isExists: os.makedirs(self.tmpFilePath) @@ -55,17 +56,20 @@ def handle_voice(self, msg): if conf().get('speech_recognition') != True : return logger.debug("[WX]receive voice msg: ", msg['FileName']) - fileName = msg['FileName'] - msg.download(self.tmpFilePath+fileName) - content = super().build_void_text(self.tmpFilePath+fileName) - self._handle_single_msg(msg, content) + thread_pool.submit(self._do_handle_voice, msg) + + def _do_handle_voice(self, msg): + fileName = self.tmpFilePath+msg['FileName'] + msg.download(fileName) + content = super().build_voice_to_text(fileName) + self._handle_single_msg(msg, content, True) def handle_text(self, msg): logger.debug("[WX]receive text msg: " + json.dumps(msg, ensure_ascii=False)) content = msg['Text'] - self._handle_single_msg(msg, content) + self._handle_single_msg(msg, content, False) - def _handle_single_msg(self, msg, content): + def _handle_single_msg(self, msg, content, is_voice): from_user_id = msg['FromUserName'] to_user_id = msg['ToUserName'] # 接收人id other_user_id = msg['User']['UserName'] # 对手方id @@ -84,9 +88,10 @@ def _handle_single_msg(self, msg, content): if img_match_prefix: content = content.split(img_match_prefix, 1)[1].strip() thread_pool.submit(self._do_send_img, content, from_user_id) - else: - thread_pool.submit(self._do_send, content, from_user_id) - + elif is_voice: + thread_pool.submit(self._do_send_voice, content, from_user_id) + else : + thread_pool.submit(self._do_send_text, content, from_user_id) elif to_user_id == other_user_id and match_prefix: # 自己给好友发送消息 str_list = content.split(match_prefix, 1) @@ -96,8 +101,10 @@ def _handle_single_msg(self, msg, content): if img_match_prefix: content = content.split(img_match_prefix, 1)[1].strip() thread_pool.submit(self._do_send_img, content, to_user_id) + elif is_voice: + thread_pool.submit(self._do_send_voice, content, to_user_id) else: - thread_pool.submit(self._do_send, content, to_user_id) + thread_pool.submit(self._do_send_text, content, to_user_id) def handle_group(self, msg): @@ -129,10 +136,24 @@ def handle_group(self, msg): thread_pool.submit(self._do_send_group, content, msg) def send(self, msg, receiver): - logger.info('[WX] sendMsg={}, receiver={}'.format(msg, receiver)) itchat.send(msg, toUserName=receiver) + logger.info('[WX] sendMsg={}, receiver={}'.format(msg, receiver)) - def _do_send(self, query, reply_user_id): + def _do_send_voice(self, query, reply_user_id): + try: + if not query: + return + context = dict() + context['from_user_id'] = reply_user_id + reply_text = super().build_reply_content(query, context) + if reply_text: + replyFile = super().build_text_to_voice(reply_text) + itchat.send_file(replyFile, toUserName=reply_user_id) + logger.info('[WX] sendFile={}, receiver={}'.format(replyFile, reply_user_id)) + except Exception as e: + logger.exception(e) + + def _do_send_text(self, query, reply_user_id): try: if not query: return @@ -162,8 +183,8 @@ def _do_send_img(self, query, reply_user_id): image_storage.seek(0) # 图片发送 - logger.info('[WX] sendImage, receiver={}'.format(reply_user_id)) itchat.send_image(image_storage, reply_user_id) + logger.info('[WX] sendImage, receiver={}'.format(reply_user_id)) except Exception as e: logger.exception(e) diff --git a/voice/google/google_voice.py b/voice/google/google_voice.py index 97597b39e..58955f450 100644 --- a/voice/google/google_voice.py +++ b/voice/google/google_voice.py @@ -4,23 +4,47 @@ """ import subprocess -import speech_recognition +import time +import speech_recognition +import pyttsx3 +from common.log import logger from voice.voice import Voice + class GoogleVoice(Voice): + tmpFilePath = './tmp/' recognizer = speech_recognition.Recognizer() + engine = pyttsx3.init() def __init__(self): - pass + # 语速 + self.engine.setProperty('rate', 125) + # 音量 + self.engine.setProperty('volume', 1.0) + # 0为男声,1为女声 + voices = self.engine.getProperty('voices') + self.engine.setProperty('voice', voices[1].id) def voiceToText(self, voice_file): new_file = voice_file.replace('.mp3', '.wav') - subprocess.call('ffmpeg -i ' + voice_file + ' -acodec pcm_s16le -ac 1 -ar 16000 ' + new_file, shell=True) + subprocess.call('ffmpeg -i ' + voice_file + + ' -acodec pcm_s16le -ac 1 -ar 16000 ' + new_file, shell=True) with speech_recognition.AudioFile(new_file) as source: audio = self.recognizer.record(source) try: - return self.recognizer.recognize_google(audio, language='zh-CN') + text = self.recognizer.recognize_google(audio, language='zh-CN') + logger.info( + '[Google] voiceToText text={} voice file name={}'.format(text, voice_file)) + return text except speech_recognition.UnknownValueError: return "抱歉,我听不懂。" except speech_recognition.RequestError as e: return "抱歉,无法连接到 Google 语音识别服务;{0}".format(e) + + def textToVoice(self, text): + textFile = self.tmpFilePath + '语音回复_' + str(int(time.time())) + '.mp3' + self.engine.save_to_file(text, textFile) + self.engine.runAndWait() + logger.info( + '[Google] textToVoice text={} voice file name={}'.format(text, textFile)) + return textFile diff --git a/voice/voice.py b/voice/voice.py index 2f66dab20..52d8aaa52 100644 --- a/voice/voice.py +++ b/voice/voice.py @@ -8,3 +8,9 @@ def voiceToText(self, voice_file): Send voice to voice service and get text """ raise NotImplementedError + + def textToVoice(self, text): + """ + Send text to voice service and get voice + """ + raise NotImplementedError \ No newline at end of file From 720ad07f83cbfa73dfa27c90849073f7a615fa7c Mon Sep 17 00:00:00 2001 From: wanggang Date: Tue, 7 Mar 2023 23:33:25 +0800 Subject: [PATCH 4/8] [voice] fix issue --- README.md | 6 +++--- channel/wechat/wechat_channel.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 54fbf831e..1676395c0 100644 --- a/README.md +++ b/README.md @@ -72,11 +72,11 @@ cd chatgpt-on-wechat/ pip3 install itchat-uos==1.5.0.dev0 pip3 install --upgrade openai -如果使用google的语音识别,需要安装speech_recognition和依赖的ffmpeg +如果使用google的语音识别,需要安装speech_recognition和依赖的ffmpeg和espeak pip3 install SpeechRecognition ---在MacOS中安装ffmpeg,brew install ffmpeg +--在MacOS中安装ffmpeg,brew install ffmpeg espeak --在Windows中安装ffmpeg,下载ffmpeg.exe ---在Linux中安装ffmpeg,apt-get install ffmpeg +--在Linux中安装ffmpeg,apt-get install ffmpeg espeak ``` 注:`itchat-uos`使用指定版本1.5.0.dev0,`openai`使用最新版本,需高于0.27.0。 diff --git a/channel/wechat/wechat_channel.py b/channel/wechat/wechat_channel.py index b3d36582d..3fdc94f89 100644 --- a/channel/wechat/wechat_channel.py +++ b/channel/wechat/wechat_channel.py @@ -40,7 +40,6 @@ class WechatChannel(Channel): tmpFilePath = './tmp/' def __init__(self): - voices = self.engine.getProperty('voices') isExists = os.path.exists(self.tmpFilePath) if not isExists: os.makedirs(self.tmpFilePath) From 882e6c35762bd805a6bf9a320f78ee5fa8ec7362 Mon Sep 17 00:00:00 2001 From: wanggang Date: Wed, 8 Mar 2023 11:02:01 +0800 Subject: [PATCH 5/8] [voice] add support for wispper --- bridge/bridge.py | 4 ++-- channel/wechat/wechat_channel.py | 4 ++-- config-template.json | 3 +++ voice/baidu/baidu_voice.py | 22 ++++++++++++++++++++ voice/openai/openai_voice.py | 25 +++++++++++++++++++++++ voice/voice_factory.py | 9 +++++--- voice/xfyun/xfyun_voice.py | 35 -------------------------------- 7 files changed, 60 insertions(+), 42 deletions(-) create mode 100644 voice/baidu/baidu_voice.py create mode 100644 voice/openai/openai_voice.py delete mode 100644 voice/xfyun/xfyun_voice.py diff --git a/bridge/bridge.py b/bridge/bridge.py index 9d00bfedc..e739a7f2b 100644 --- a/bridge/bridge.py +++ b/bridge/bridge.py @@ -10,7 +10,7 @@ def fetch_reply_content(self, query, context): return bot_factory.create_bot("chatGPT").reply(query, context) def fetch_voice_to_text(self, voiceFile): - return voice_factory.create_voice("google").voiceToText(voiceFile) + return voice_factory.create_voice("openai").voiceToText(voiceFile) def fetch_text_to_voice(self, text): - return voice_factory.create_voice("google").textToVoice(text) \ No newline at end of file + return voice_factory.create_voice("baidu").textToVoice(text) \ No newline at end of file diff --git a/channel/wechat/wechat_channel.py b/channel/wechat/wechat_channel.py index 3fdc94f89..22824559d 100644 --- a/channel/wechat/wechat_channel.py +++ b/channel/wechat/wechat_channel.py @@ -54,14 +54,14 @@ def startup(self): def handle_voice(self, msg): if conf().get('speech_recognition') != True : return - logger.debug("[WX]receive voice msg: ", msg['FileName']) + logger.debug("[WX]receive voice msg: " + msg['FileName']) thread_pool.submit(self._do_handle_voice, msg) def _do_handle_voice(self, msg): fileName = self.tmpFilePath+msg['FileName'] msg.download(fileName) content = super().build_voice_to_text(fileName) - self._handle_single_msg(msg, content, True) + self._handle_single_msg(msg, content, False) def handle_text(self, msg): logger.debug("[WX]receive text msg: " + json.dumps(msg, ensure_ascii=False)) diff --git a/config-template.json b/config-template.json index 9ad9f5d6f..f7549d470 100644 --- a/config-template.json +++ b/config-template.json @@ -8,6 +8,9 @@ "image_create_prefix": ["画", "看", "找"], "conversation_max_tokens": 1000, "speech_recognition": false, + "baidu_app_id": "YOUR BAIDU APP ID", + "baidu_api_key": "YOUR BAIDU API KEY", + "baidu_secret_key": "YOUR BAIDU SERVICE KEY", "character_desc": "你是ChatGPT, 一个由OpenAI训练的大型语言模型, 你旨在回答并解决人们的任何问题,并且可以使用多种语言与人交流。", "expires_in_seconds": 3600 } diff --git a/voice/baidu/baidu_voice.py b/voice/baidu/baidu_voice.py new file mode 100644 index 000000000..8534c2ba1 --- /dev/null +++ b/voice/baidu/baidu_voice.py @@ -0,0 +1,22 @@ + +""" +baidu voice service +""" +from aip import AipSpeech +from voice.voice import Voice +from config import conf + +class BaiduVoice(Voice): + APP_ID = conf().get('baidu_app_id') + API_KEY = conf().get('baidu_api_key') + SECRET_KEY = conf().get('baidu_secret_key') + client = AipSpeech(APP_ID, API_KEY, SECRET_KEY) + + def __init__(self): + pass + + def voiceToText(self, voice_file): + pass + + def textToVoice(self, text): + pass diff --git a/voice/openai/openai_voice.py b/voice/openai/openai_voice.py new file mode 100644 index 000000000..8cc28b76c --- /dev/null +++ b/voice/openai/openai_voice.py @@ -0,0 +1,25 @@ + +""" +google voice service +""" +import json +import openai +from common.log import logger +from voice.voice import Voice + + +class OpenaiVoice(Voice): + def __init__(self): + pass + + def voiceToText(self, voice_file): + file = open(voice_file, "rb") + reply = openai.Audio.transcribe("whisper-1", file) + json_dict = json.loads(reply) + text = json_dict['text'] + logger.info( + '[Openai] voiceToText text={} voice file name={}'.format(text, voice_file)) + return text + + def textToVoice(self, text): + pass diff --git a/voice/voice_factory.py b/voice/voice_factory.py index 5457d147d..053840e79 100644 --- a/voice/voice_factory.py +++ b/voice/voice_factory.py @@ -8,10 +8,13 @@ def create_voice(voice_type): :param voice_type: voice type code :return: voice instance """ - if voice_type == 'xfyun': - from voice.xfyun.xfyun_voice import XfyunVoice - return XfyunVoice() + if voice_type == 'baidu': + from voice.baidu.baidu_voice import BaiduVoice + return BaiduVoice() elif voice_type == 'google': from voice.google.google_voice import GoogleVoice return GoogleVoice() + elif voice_type == 'openai': + from voice.openai.openai_voice import OpenaiVoice + return OpenaiVoice() raise RuntimeError diff --git a/voice/xfyun/xfyun_voice.py b/voice/xfyun/xfyun_voice.py deleted file mode 100644 index 74b27b202..000000000 --- a/voice/xfyun/xfyun_voice.py +++ /dev/null @@ -1,35 +0,0 @@ - -""" -科大讯飞 voice service -""" - -from voice.voice import Voice - -# 科大讯飞语音识别 -lfasr_host = 'http://raasr.xfyun.cn/api' -# 请求的接口名 -api_prepare = '/prepare' -api_upload = '/upload' -api_merge = '/merge' -api_get_progress = '/getProgress' -api_get_result = '/getResult' -# 文件分片大小10M -file_piece_sice = 10485760 -# ——————————————————转写可配置参数———————————————— -# 参数可在官网界面(https://doc.xfyun.cn/rest_api/%E8%AF%AD%E9%9F%B3%E8%BD%AC%E5%86%99.html)查看,根据需求可自行在gene_params方法里添加修改 -# 转写类型 -lfasr_type = 0 -# 是否开启分词 -has_participle = 'false' -has_seperate = 'true' -# 多候选词个数 -max_alternatives = 0 -# 子用户标识 -suid = '' - -class XfyunVoice(Voice): - def __init__(self): - pass - - def voiceToText(self, voice_file): - pass \ No newline at end of file From d7a8854fa14fe075e59b363159bfb83c84403b4a Mon Sep 17 00:00:00 2001 From: wanggang Date: Wed, 8 Mar 2023 11:32:27 +0800 Subject: [PATCH 6/8] [voice] add support for whisper-1 model --- README.md | 3 +++ channel/wechat/wechat_channel.py | 7 ++++--- voice/google/google_voice.py | 2 +- voice/openai/openai_voice.py | 8 +++++--- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 1676395c0..93660e861 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,9 @@ cd chatgpt-on-wechat/ pip3 install itchat-uos==1.5.0.dev0 pip3 install --upgrade openai +默认使用openai的whisper-1模型 +如果使用百度的语音识别,需要安装百度的pythonSDK +pip3 install baidu-aip 如果使用google的语音识别,需要安装speech_recognition和依赖的ffmpeg和espeak pip3 install SpeechRecognition --在MacOS中安装ffmpeg,brew install ffmpeg espeak diff --git a/channel/wechat/wechat_channel.py b/channel/wechat/wechat_channel.py index 22824559d..0f2061394 100644 --- a/channel/wechat/wechat_channel.py +++ b/channel/wechat/wechat_channel.py @@ -5,6 +5,7 @@ """ import os +import pathlib import itchat import json from itchat.content import * @@ -37,11 +38,11 @@ def handler_single_voice(msg): class WechatChannel(Channel): - tmpFilePath = './tmp/' + tmpFilePath = pathlib.Path('./tmp/') def __init__(self): - isExists = os.path.exists(self.tmpFilePath) - if not isExists: + pathExists = os.path.exists(self.tmpFilePath) + if not pathExists and conf().get('speech_recognition') == True: os.makedirs(self.tmpFilePath) def startup(self): diff --git a/voice/google/google_voice.py b/voice/google/google_voice.py index 58955f450..3fff9d7e6 100644 --- a/voice/google/google_voice.py +++ b/voice/google/google_voice.py @@ -3,6 +3,7 @@ google voice service """ +import pathlib import subprocess import time import speech_recognition @@ -12,7 +13,6 @@ class GoogleVoice(Voice): - tmpFilePath = './tmp/' recognizer = speech_recognition.Recognizer() engine = pyttsx3.init() diff --git a/voice/openai/openai_voice.py b/voice/openai/openai_voice.py index 8cc28b76c..475aac673 100644 --- a/voice/openai/openai_voice.py +++ b/voice/openai/openai_voice.py @@ -4,19 +4,21 @@ """ import json import openai +from config import conf from common.log import logger from voice.voice import Voice class OpenaiVoice(Voice): def __init__(self): - pass + openai.api_key = conf().get('open_ai_api_key') def voiceToText(self, voice_file): + logger.debug( + '[Openai] voice file name={}'.format(voice_file)) file = open(voice_file, "rb") reply = openai.Audio.transcribe("whisper-1", file) - json_dict = json.loads(reply) - text = json_dict['text'] + text = reply["text"] logger.info( '[Openai] voiceToText text={} voice file name={}'.format(text, voice_file)) return text From 3db452ef71940d8cc0d0c4b76ab08955b0738d2a Mon Sep 17 00:00:00 2001 From: wanggang Date: Wed, 8 Mar 2023 15:22:46 +0800 Subject: [PATCH 7/8] [voice] using baidu service to gen reply voice --- README.md | 6 +++--- channel/wechat/wechat_channel.py | 19 +++++++------------ common/tmp_dir.py | 20 ++++++++++++++++++++ config-template.json | 1 + voice/baidu/baidu_voice.py | 16 +++++++++++++++- voice/google/google_voice.py | 3 ++- 6 files changed, 48 insertions(+), 17 deletions(-) create mode 100644 common/tmp_dir.py diff --git a/README.md b/README.md index 93660e861..8fe3b30d9 100644 --- a/README.md +++ b/README.md @@ -72,9 +72,8 @@ cd chatgpt-on-wechat/ pip3 install itchat-uos==1.5.0.dev0 pip3 install --upgrade openai -默认使用openai的whisper-1模型 如果使用百度的语音识别,需要安装百度的pythonSDK -pip3 install baidu-aip +pip3 install baidu-aip chardet 如果使用google的语音识别,需要安装speech_recognition和依赖的ffmpeg和espeak pip3 install SpeechRecognition --在MacOS中安装ffmpeg,brew install ffmpeg espeak @@ -122,7 +121,8 @@ cp config-template.json config.json + 可选配置: `group_name_keyword_white_list`配置项支持模糊匹配群名称,`group_chat_keyword`配置项则支持模糊匹配群消息内容,用法与上述两个配置项相同。(Contributed by [evolay](https://github.com/evolay)) **3.语音识别** -+ 配置`speech_recognition=true`开启语音识别 ++ 配置`speech_recognition=true`开启语音识别,默认使用openai的whisper模型 ++ 配置`voice_reply_voice=true`语音回复语音,但是需要配置对应语音合成平台的key **4.其他配置** diff --git a/channel/wechat/wechat_channel.py b/channel/wechat/wechat_channel.py index 0f2061394..b861e358c 100644 --- a/channel/wechat/wechat_channel.py +++ b/channel/wechat/wechat_channel.py @@ -4,14 +4,13 @@ wechat channel """ -import os -import pathlib import itchat import json from itchat.content import * from channel.channel import Channel from concurrent.futures import ThreadPoolExecutor from common.log import logger +from common.tmp_dir import TmpDir from config import conf import requests import io @@ -38,12 +37,8 @@ def handler_single_voice(msg): class WechatChannel(Channel): - tmpFilePath = pathlib.Path('./tmp/') - def __init__(self): - pathExists = os.path.exists(self.tmpFilePath) - if not pathExists and conf().get('speech_recognition') == True: - os.makedirs(self.tmpFilePath) + pass def startup(self): # login by scan QRCode @@ -59,17 +54,17 @@ def handle_voice(self, msg): thread_pool.submit(self._do_handle_voice, msg) def _do_handle_voice(self, msg): - fileName = self.tmpFilePath+msg['FileName'] + fileName = TmpDir().path() + msg['FileName'] msg.download(fileName) content = super().build_voice_to_text(fileName) - self._handle_single_msg(msg, content, False) + self._handle_single_msg(msg, content, conf().get('voice_reply_voice')) def handle_text(self, msg): logger.debug("[WX]receive text msg: " + json.dumps(msg, ensure_ascii=False)) content = msg['Text'] self._handle_single_msg(msg, content, False) - def _handle_single_msg(self, msg, content, is_voice): + def _handle_single_msg(self, msg, content, reply_voice=False): from_user_id = msg['FromUserName'] to_user_id = msg['ToUserName'] # 接收人id other_user_id = msg['User']['UserName'] # 对手方id @@ -88,7 +83,7 @@ def _handle_single_msg(self, msg, content, is_voice): if img_match_prefix: content = content.split(img_match_prefix, 1)[1].strip() thread_pool.submit(self._do_send_img, content, from_user_id) - elif is_voice: + elif reply_voice: thread_pool.submit(self._do_send_voice, content, from_user_id) else : thread_pool.submit(self._do_send_text, content, from_user_id) @@ -101,7 +96,7 @@ def _handle_single_msg(self, msg, content, is_voice): if img_match_prefix: content = content.split(img_match_prefix, 1)[1].strip() thread_pool.submit(self._do_send_img, content, to_user_id) - elif is_voice: + elif reply_voice: thread_pool.submit(self._do_send_voice, content, to_user_id) else: thread_pool.submit(self._do_send_text, content, to_user_id) diff --git a/common/tmp_dir.py b/common/tmp_dir.py new file mode 100644 index 000000000..1738022ca --- /dev/null +++ b/common/tmp_dir.py @@ -0,0 +1,20 @@ + +import os +import pathlib +from config import conf + + +class TmpDir(object): + """A temporary directory that is deleted when the object is destroyed. + """ + + tmpFilePath = pathlib.Path('./tmp/') + + def __init__(self): + pathExists = os.path.exists(self.tmpFilePath) + if not pathExists and conf().get('speech_recognition') == True: + os.makedirs(self.tmpFilePath) + + def path(self): + return str(self.tmpFilePath) + '/' + \ No newline at end of file diff --git a/config-template.json b/config-template.json index f7549d470..7e693f6f9 100644 --- a/config-template.json +++ b/config-template.json @@ -8,6 +8,7 @@ "image_create_prefix": ["画", "看", "找"], "conversation_max_tokens": 1000, "speech_recognition": false, + "voice_reply_voice": false, "baidu_app_id": "YOUR BAIDU APP ID", "baidu_api_key": "YOUR BAIDU API KEY", "baidu_secret_key": "YOUR BAIDU SERVICE KEY", diff --git a/voice/baidu/baidu_voice.py b/voice/baidu/baidu_voice.py index 8534c2ba1..d99db37dc 100644 --- a/voice/baidu/baidu_voice.py +++ b/voice/baidu/baidu_voice.py @@ -2,7 +2,10 @@ """ baidu voice service """ +import time from aip import AipSpeech +from common.log import logger +from common.tmp_dir import TmpDir from voice.voice import Voice from config import conf @@ -19,4 +22,15 @@ def voiceToText(self, voice_file): pass def textToVoice(self, text): - pass + result = self.client.synthesis(text, 'zh', 1, { + 'spd': 5, 'pit': 5, 'vol': 5, 'per': 111 + }) + if not isinstance(result, dict): + fileName = TmpDir().path() + '语音回复_' + str(int(time.time())) + '.mp3' + with open(fileName, 'wb') as f: + f.write(result) + logger.info('[Baidu] textToVoice text={} voice file name={}'.format(text, fileName)) + return fileName + else: + logger.error('[Baidu] textToVoice error={}'.format(result)) + return None diff --git a/voice/google/google_voice.py b/voice/google/google_voice.py index 3fff9d7e6..8e339f2bc 100644 --- a/voice/google/google_voice.py +++ b/voice/google/google_voice.py @@ -9,6 +9,7 @@ import speech_recognition import pyttsx3 from common.log import logger +from common.tmp_dir import TmpDir from voice.voice import Voice @@ -42,7 +43,7 @@ def voiceToText(self, voice_file): return "抱歉,无法连接到 Google 语音识别服务;{0}".format(e) def textToVoice(self, text): - textFile = self.tmpFilePath + '语音回复_' + str(int(time.time())) + '.mp3' + textFile = TmpDir().path() + '语音回复_' + str(int(time.time())) + '.mp3' self.engine.save_to_file(text, textFile) self.engine.runAndWait() logger.info( From d02508df413f222893042805e065667c8a4596f3 Mon Sep 17 00:00:00 2001 From: wanggang Date: Wed, 8 Mar 2023 16:39:25 +0800 Subject: [PATCH 8/8] [voice] Readme modify --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8fe3b30d9..09ca878d1 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,7 @@ cp config-template.json config.json **3.语音识别** + 配置`speech_recognition=true`开启语音识别,默认使用openai的whisper模型 -+ 配置`voice_reply_voice=true`语音回复语音,但是需要配置对应语音合成平台的key ++ 配置`voice_reply_voice=true`语音回复语音,但是需要配置对应语音合成平台的key,由于itchat协议的限制,只能发送语音mp3文件。使用wechaty则回复的是微信语音。 **4.其他配置**