Raspberry Pi Zero WでChatGPTを使って会話してみる

chatgptを用いて会話できるｽﾀｯｸﾁｬﾝが人気ですが、Cが書けない私は未だｽﾀｯｸﾁｬﾝと会話ができておりません。

裏技でｽﾀｯｸﾁｬﾝにbluetoothモジュールを仕込み、スマホからchatgptと音声会話した内容をスピーカーから鳴らすだけというエセｽﾀｯｸﾁｬﾝは作ったものの、やはり達成感がありません。

と言うことで、pythonならライブラリが豊富にあるのでraspberry piでchatgptと会話できるものを作ってみました。

Twitterとか見ているとみんな簡単にやってのけていますが、個人的には非常に苦労しました(-_-;)

やっぱりTwitterの電子工作界隈って化け物ぞろいですね。

それではさくっとまとめようかと思います。

1 使うもの
2 実現したい内容
3 事前準備
4 プログラム
5 完成

使うもの

・Raspberry Pi Zero W

・WM8960 Hi-Fi サウンドカードオーディオ HAT

実現したい内容

・chatgptと連携させた音声会話

・音声が少し途切れても１つの文章として認識させる（継続）

・音声が3秒ほど途切れたら文章の終了とさせる

事前準備

今回micやアンプには「 WM8960 Hi-Fi サウンドカードオーディオ HAT 」という基板を使用しました。

このオーディオ基板を使用するにあたり、ドライバのインストールが必要でしたので、初めにその手順を載せておきます。

下記URLでインストールできます。

WM8960 オーディオ HAT

これで専用のライブラリとかが使えるようになります。

プログラム

import pyaudio
import struct
import math
import audioop
import wave
import time
import os
import io
from google.cloud import speech
import openai
from google.cloud import texttospeech
from gtts import gTTS
import pygame.mixer

# Initialise the pygame mixer that plays back the synthesized reply.
pygame.mixer.init()

# Credentials come from the environment instead of being overwritten with
# empty strings, so an exported OPENAI_API_KEY or
# GOOGLE_APPLICATION_CREDENTIALS keeps working. Fill in literal values here
# only if the environment variables are not used.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', '')
# Instantiates a client
client = speech.SpeechClient()

chunk = 1024              # frames requested from the input stream per read
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
INPUT_DEVICE_INDEX = 0
THRESHOLD = 700           # RMS level a chunk must exceed to count as speech
SILENCE_LIMIT = 3         # seconds of silence that end a recording
RECORD_SECONDS = 5        # only referenced by commented-out code in this listing
WAVE_OUTPUT_FILENAME = "output.wav"
# Audio file handed to speech recognition; it is the recording written above,
# so both names share one path.
speech_file = WAVE_OUTPUT_FILENAME

p = pyaudio.PyAudio()

def get_rms(block):
    """
    Return the root mean square of an audio block of 16-bit samples.

    `block` is a bytes-like object holding signed 2-byte samples in native
    byte order. The result is an int (the square root is truncated).
    An empty block yields 0.

    `audioop` is used when it can be imported. It was removed from the
    standard library in Python 3.13, so a pure-Python computation with the
    same truncating result is used otherwise.
    """
    if not block:
        return 0
    try:
        import audioop
    except ImportError:
        audioop = None
    if audioop is not None:
        return audioop.rms(block, 2)
    # Fallback: unpack the native-order 16-bit samples; a block that is not a
    # whole number of samples raises struct.error.
    count = len(block) // 2
    samples = struct.unpack("%dh" % count, block)
    return int(math.sqrt(sum(s * s for s in samples) / count))

def get_audio_input_device(p, input_device_index, channels=1, rate=44100, frames_per_buffer=1024):
    """
    Open an input stream through `p` and return it.

    The stream is requested with the paInt16 sample format, the given device
    index, channel count, sample rate and buffer size, and with input=True.
    `p` is expected to provide an ``open`` method accepting these keywords
    (the script passes its pyaudio.PyAudio instance).
    """
    stream_options = {
        "format": pyaudio.paInt16,
        "channels": channels,
        "rate": rate,
        "input": True,
        "input_device_index": input_device_index,
        "frames_per_buffer": frames_per_buffer,
    }
    return p.open(**stream_options)

def ask_gpt(text):
    """
    Send `text` as a prompt to openai.Completion and return the reply text.

    Exactly one completion is requested; the text of the first choice is
    returned with surrounding whitespace stripped.
    """
    request = dict(
        engine="davinci",   # language model to use
        prompt=text,
        max_tokens=1024,    # upper bound on the length of the reply
        n=1,                # number of candidate replies
        stop=None,          # no explicit stop sequence
    )
    completion = openai.Completion.create(**request)
    first_choice = completion.choices[0]
    return first_choice.text.strip()

# text to speech関数（引数：変換テキスト、対応言語、出力ファイル名）
def text_to_speech(text, language, name):
    """
    Convert `text` to speech with gTTS and save it as "<name>.mp3".

    `language` is passed to gTTS as ``lang`` (e.g. "ja" for Japanese).
    Always returns True once the file has been saved.
    """
    # Build the gTTS instance and write its audio straight to the mp3 file.
    gTTS(text, lang=language).save(name + ".mp3")
    return True

# Main script: wait for speech, record one utterance, transcribe it, ask
# ChatGPT, speak the answer, then go back to waiting.
stream = get_audio_input_device(p, INPUT_DEVICE_INDEX, CHANNELS, RATE, chunk)


def _record_utterance(stream):
    """Block until speech is heard and return the recorded chunks as a list.

    Recording starts with the first chunk whose RMS exceeds THRESHOLD (that
    chunk is kept). Every following chunk is kept, including short pauses,
    and recording ends once more than SILENCE_LIMIT seconds' worth of
    consecutive chunks stay at or below THRESHOLD.
    """
    silent_chunks_to_stop = int(RATE / chunk * SILENCE_LIMIT)
    frames = []
    silent_chunks = 0
    recording = False
    while True:
        block = stream.read(chunk, exception_on_overflow=False)
        if get_rms(block) > THRESHOLD:
            if not recording:
                recording = True
                print("Recording started")
            silent_chunks = 0
        elif not recording:
            continue  # still waiting for the first loud chunk
        else:
            silent_chunks += 1
        frames.append(block)
        if silent_chunks > silent_chunks_to_stop:
            print("Recording stopped")
            return frames


def _save_wav(frames, path):
    """Write the recorded chunks to `path` as a WAV file."""
    with wave.open(path, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))


def _transcribe(path):
    """Send the audio file at `path` to Speech-to-Text and return the text.

    The top alternative of every result is concatenated, so a long utterance
    split into several results is not reduced to its last part. Returns ""
    when nothing was recognized.
    """
    with io.open(path, 'rb') as f:
        content = f.read()
    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        # An explicit encoding raised an error, so ENCODING_UNSPECIFIED is used.
        encoding=speech.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        sample_rate_hertz=RATE,
        language_code="ja-JP",
    )
    response = client.recognize(config=config, audio=audio)
    parts = [
        result.alternatives[0].transcript
        for result in response.results
        if result.alternatives
    ]
    return "".join(parts).strip()


def _chat_reply(prompt):
    """Send `prompt` to gpt-3.5-turbo as a user message and return the reply."""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return response.choices[0]["message"]["content"].strip()


def _play_mp3(path):
    """Play the mp3 at `path` and return when playback has finished."""
    pygame.mixer.music.load(path)
    pygame.mixer.music.play(0)
    while pygame.mixer.music.get_busy():
        time.sleep(0.2)
    pygame.mixer.music.stop()


try:
    while True:
        frames = _record_utterance(stream)
        _save_wav(frames, WAVE_OUTPUT_FILENAME)
        print("wait....")
        text = _transcribe(WAVE_OUTPUT_FILENAME)
        if not text:
            continue  # nothing recognized; wait for the next utterance
        print("Transcript: {}".format(text))

        answer = _chat_reply(text)
        print("response" + answer)
        if not answer:
            continue  # gTTS cannot synthesize an empty string

        # Convert the answer to an mp3 file and play it.
        name = "gTTS_Text2Speech"
        text_to_speech(answer, "ja", name)
        _play_mp3(name + ".mp3")
        print("Done.")
except KeyboardInterrupt:
    pass  # Ctrl-C ends the conversation loop
finally:
    # Release the audio device even when an API call raised.
    stream.stop_stream()
    stream.close()
    p.terminate()

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

import pyaudio

import struct

import math

import audioop

import wave

import time

import os

import io

from google.cloud import speech

import openai

from google.cloud import texttospeech

from gtts import gTTS

import pygame.mixer

# Initialise the pygame mixer that plays back the synthesized reply.
pygame.mixer.init()

# Credentials come from the environment instead of being overwritten with
# empty strings, so an exported OPENAI_API_KEY or
# GOOGLE_APPLICATION_CREDENTIALS keeps working. Fill in literal values here
# only if the environment variables are not used.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', '')
# Instantiates a client
client = speech.SpeechClient()

chunk = 1024              # frames requested from the input stream per read
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
INPUT_DEVICE_INDEX = 0
THRESHOLD = 700           # RMS level a chunk must exceed to count as speech
SILENCE_LIMIT = 3         # seconds of silence that end a recording
RECORD_SECONDS = 5        # only referenced by commented-out code in this listing
WAVE_OUTPUT_FILENAME = "output.wav"
# Audio file handed to speech recognition; it is the recording written above,
# so both names share one path.
speech_file = WAVE_OUTPUT_FILENAME

p = pyaudio.PyAudio()

def get_rms(block):
    """
    Return the root mean square of an audio block of 16-bit samples.

    `block` is a bytes-like object holding signed 2-byte samples in native
    byte order. The result is an int (the square root is truncated).
    An empty block yields 0.

    `audioop` is used when it can be imported. It was removed from the
    standard library in Python 3.13, so a pure-Python computation with the
    same truncating result is used otherwise.
    """
    if not block:
        return 0
    try:
        import audioop
    except ImportError:
        audioop = None
    if audioop is not None:
        return audioop.rms(block, 2)
    # Fallback: unpack the native-order 16-bit samples; a block that is not a
    # whole number of samples raises struct.error.
    count = len(block) // 2
    samples = struct.unpack("%dh" % count, block)
    return int(math.sqrt(sum(s * s for s in samples) / count))

def get_audio_input_device(p, input_device_index, channels=1, rate=44100, frames_per_buffer=1024):
    """
    Open an input stream through `p` and return it.

    The stream is requested with the paInt16 sample format, the given device
    index, channel count, sample rate and buffer size, and with input=True.
    `p` is expected to provide an ``open`` method accepting these keywords
    (the script passes its pyaudio.PyAudio instance).
    """
    stream = p.open(format=pyaudio.paInt16,
                    channels=channels,
                    rate=rate,
                    input=True,
                    input_device_index=input_device_index,
                    frames_per_buffer=frames_per_buffer)
    return stream

def ask_gpt(text):
    """
    Send `text` as a prompt to openai.Completion and return the reply text.

    Exactly one completion is requested; the text of the first choice is
    returned with surrounding whitespace stripped.
    """
    response = openai.Completion.create(
        engine="davinci",  # language model to use
        prompt=text,
        max_tokens=1024,  # upper bound on the length of the reply
        n=1,  # number of candidate replies
        stop=None,  # no explicit stop sequence
    )
    return response.choices[0].text.strip()

# text to speech関数（引数：変換テキスト、対応言語、出力ファイル名）

def text_to_speech(text, language, name):
    """
    Convert `text` to speech with gTTS and save it as "<name>.mp3".

    `language` is passed to gTTS as ``lang`` (e.g. "ja" for Japanese).
    Always returns True once the file has been saved.
    """
    # Create the gTTS instance for the text in the requested language.
    text2speech = gTTS(text, lang=language)

    # Save the synthesized audio to the mp3 file.
    text2speech.save(name + ".mp3")

    return True

# Main script: wait for speech, record one utterance, transcribe it, ask
# ChatGPT, speak the answer, then go back to waiting.
stream = get_audio_input_device(p, INPUT_DEVICE_INDEX, CHANNELS, RATE, chunk)


def _record_utterance(stream):
    """Block until speech is heard and return the recorded chunks as a list.

    Recording starts with the first chunk whose RMS exceeds THRESHOLD (that
    chunk is kept). Every following chunk is kept, including short pauses,
    and recording ends once more than SILENCE_LIMIT seconds' worth of
    consecutive chunks stay at or below THRESHOLD.
    """
    silent_chunks_to_stop = int(RATE / chunk * SILENCE_LIMIT)
    frames = []
    silent_chunks = 0
    recording = False
    while True:
        block = stream.read(chunk, exception_on_overflow=False)
        if get_rms(block) > THRESHOLD:
            if not recording:
                recording = True
                print("Recording started")
            silent_chunks = 0
        elif not recording:
            continue  # still waiting for the first loud chunk
        else:
            silent_chunks += 1
        frames.append(block)
        if silent_chunks > silent_chunks_to_stop:
            print("Recording stopped")
            return frames


def _save_wav(frames, path):
    """Write the recorded chunks to `path` as a WAV file."""
    with wave.open(path, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))


def _transcribe(path):
    """Send the audio file at `path` to Speech-to-Text and return the text.

    The top alternative of every result is concatenated, so a long utterance
    split into several results is not reduced to its last part. Returns ""
    when nothing was recognized.
    """
    with io.open(path, 'rb') as f:
        content = f.read()
    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        # An explicit encoding raised an error, so ENCODING_UNSPECIFIED is used.
        encoding=speech.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        sample_rate_hertz=RATE,
        language_code="ja-JP",
    )
    response = client.recognize(config=config, audio=audio)
    parts = [
        result.alternatives[0].transcript
        for result in response.results
        if result.alternatives
    ]
    return "".join(parts).strip()


def _chat_reply(prompt):
    """Send `prompt` to gpt-3.5-turbo as a user message and return the reply."""
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return response.choices[0]["message"]["content"].strip()


def _play_mp3(path):
    """Play the mp3 at `path` and return when playback has finished."""
    pygame.mixer.music.load(path)
    pygame.mixer.music.play(0)
    while pygame.mixer.music.get_busy():
        time.sleep(0.2)
    pygame.mixer.music.stop()


try:
    while True:
        frames = _record_utterance(stream)
        _save_wav(frames, WAVE_OUTPUT_FILENAME)
        print("wait....")
        text = _transcribe(WAVE_OUTPUT_FILENAME)
        if not text:
            continue  # nothing recognized; wait for the next utterance
        print("Transcript: {}".format(text))

        answer = _chat_reply(text)
        print("response" + answer)
        if not answer:
            continue  # gTTS cannot synthesize an empty string

        # Convert the answer to an mp3 file and play it.
        name = "gTTS_Text2Speech"
        text_to_speech(answer, "ja", name)
        _play_mp3(name + ".mp3")
        print("Done.")
except KeyboardInterrupt:
    pass  # Ctrl-C ends the conversation loop
finally:
    # Release the audio device even when an API call raised.
    stream.stop_stream()
    stream.close()
    p.terminate()

流れ的には

１，音声を拾う（閾値を超えた音だけ）

２，音声をテキスト化する（Speech-to-Text）

３，テキストをchatgptへ投げる

４，回答をテキストで受け取り音声化する（gtts）

５，再生させる

となります。

音声が閾値を下回っている間は録音待ちでループし続け、閾値を超えれば録音を開始します。途中で音声が途切れても、途切れが3秒未満であれば１つの文として録音を続けます。

音声が3秒以上途切れれば録音を停止し、上記１～５を実行。その後はまた録音待ちのループに戻ります。

完成

やっとchatgptと音声で会話できるようになった。これ皆サクッとやってる感じだけど結構大変だったよ(-_-;)
あとはこれ応用してなんか作る pic.twitter.com/DuZloJVxcP
— ロビヲ (@ketunorobio) May 6, 2023

使うもの

実現したい内容

事前準備

プログラム

完成

コメントを残す コメントをキャンセル

コメントを残すコメントをキャンセル