haha
This commit is contained in:
parent
2095207a1d
commit
6754406c0c
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
[submodule "opaudio"]
|
||||||
|
path = opaudio
|
||||||
|
url = https://grrrit.le-club-des-sans-sujets.org/Le-Club-des-Sans-Sujets/opaudio.git
|
||||||
289
audio.py
Normal file
289
audio.py
Normal file
@ -0,0 +1,289 @@
|
|||||||
|
import numpy
|
||||||
|
import math
|
||||||
|
import pyaudio as pa
|
||||||
|
from opaudio import audioop
|
||||||
|
import threading
|
||||||
|
import collections
|
||||||
|
|
||||||
|
|
||||||
|
class AudioSource(object):
    """Abstract base for audio input sources.

    Concrete sources (e.g. ``Microphone``) implement the context-manager
    protocol so callers can write ``with source:`` to open and close the
    underlying stream, and expose ``stream``, ``SAMPLE_RATE``,
    ``SAMPLE_WIDTH`` and ``CHUNK`` attributes.
    """

    def __init__(self):
        """Subclasses must override construction; the base cannot be instantiated."""
        raise NotImplementedError("this is an abstract class")

    def __enter__(self):
        """Open the underlying audio stream. Must be overridden."""
        raise NotImplementedError("this is an abstract class")

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the underlying audio stream. Must be overridden."""
        raise NotImplementedError("this is an abstract class")
|
class Microphone(AudioSource):
    """PyAudio-backed microphone source producing 16-bit mono PCM.

    Construct with a device-info dict from ``Microphone.select()`` (or
    ``PyAudio.get_device_info_by_index``); use as a context manager to
    open/close the capture stream.
    """

    def __init__(self, device_info=None, sample_rate=None, chunk_size=1024):
        """Configure the microphone.

        device_info: PyAudio device-info dict (required; see Microphone.select).
        sample_rate: capture rate in Hz; defaults to the device's reported
                     defaultSampleRate.
        chunk_size:  frames per buffer read from the stream.
        """
        assert device_info is not None, "device_info must not be None (see Microphone.select)"
        a = pa.PyAudio()
        try:
            if sample_rate is None:
                # Fall back to the device's advertised default rate, but only
                # if PyAudio reported a sane positive number.
                assert isinstance(device_info.get('defaultSampleRate'), (float, int)) and device_info.get('defaultSampleRate') > 0, "Wrong sample rate provided by PyAudio"
                sample_rate = int(device_info.get('defaultSampleRate'))
            self.device_info = device_info
        finally:
            a.terminate()

        self.format = pa.paInt16                            # 16-bit signed samples
        self.SAMPLE_WIDTH = pa.get_sample_size(self.format) # bytes per sample (2)
        self.SAMPLE_RATE = sample_rate
        self.CHUNK = chunk_size

        self.audio = None
        self.stream = None  # set while inside the context manager

    @staticmethod
    def select():
        """Interactively list input devices and return the chosen device-info dict."""
        n = 0
        microphones = []
        a = pa.PyAudio()
        for i in range(a.get_device_count()):
            d = a.get_device_info_by_index(i)
            if d.get('maxInputChannels') > 0:
                microphones.append(d)
                print(f"{n}. {d.get('name')}")
                n += 1

        while True:
            sel = input("select microphone: ")
            # Valid indices are 0..n-1, so reject sel == n as well.
            # (was: `int(sel) > n`, which accepted the out-of-range index n
            # and crashed with IndexError on the lookup below)
            if not sel.isdigit() or int(sel) >= n:
                print("Wrong selection.")
                continue
            m = microphones[int(sel)]
            a.terminate()
            return m

    def __enter__(self):
        """Open the PyAudio input stream; returns self."""
        assert self.stream is None, "Source already streaming"
        self.a = pa.PyAudio()
        try:
            pa_stream = self.a.open(input_device_index=self.device_info.get('index'), channels=1, format=self.format, rate=self.SAMPLE_RATE, frames_per_buffer=self.CHUNK, input=True)
            self.stream = Microphone.Stream(pa_stream)
        except Exception as e:
            # NOTE(review): failure is only printed and self.stream stays None,
            # so callers relying on Listener.check_source will hit its
            # assertion later — consider re-raising instead.
            print(e)
            self.a.terminate()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the stream and release PyAudio, even if close() fails."""
        try:
            self.stream.close()
        finally:
            self.stream = None
            self.a.terminate()

    class Stream(object):
        """Thin wrapper around a PyAudio stream exposing read()/close()."""

        def __init__(self, pa_stream):
            self.pa_stream = pa_stream

        def read(self, size):
            # Overflow is tolerated (dropped frames) rather than raised, so a
            # slow consumer does not kill the capture loop.
            return self.pa_stream.read(size, exception_on_overflow=False)

        def close(self):
            """Stop (if running) and close the underlying PyAudio stream."""
            try:
                if not self.pa_stream.is_stopped():
                    self.pa_stream.stop_stream()
            finally:
                self.pa_stream.close()
|
class WaitTimeoutError(Exception):
    """Raised by Listener.listen() when no speech starts within listen_timeout."""
|
class Listener(AudioSource):
    """Energy-based speech detector that slices phrases out of an AudioSource.

    NOTE(review): subclasses AudioSource but never opens a stream itself; it is
    driven against an already-streaming source (see check_source).
    """

    def __init__(self):
        self.energy_threshold = 300  # minimum audio energy to consider for recording
        self.dynamic_energy_threshold = True
        self.dynamic_energy_adjustment_damping = 0.15
        self.dynamic_energy_ratio = 1.5
        self.pause_threshold = 0.8  # seconds of non-speaking audio before a phrase is considered complete
        self.operation_timeout = None  # seconds after an internal operation (e.g., an API request) starts before it times out, or ``None`` for no timeout
        self.phrase_threshold = 0.5  # minimum seconds of speaking audio before we consider the speaking audio a phrase - values below this are ignored (for filtering out clicks and pops)
        self.non_speaking_duration = 0.5  # seconds of non-speaking audio to keep on both sides of the recording

    def check_source(self, source):
        """Assert that *source* is an AudioSource that is currently streaming."""
        assert isinstance(source, AudioSource), "Source must be an AudioSource"
        assert source.stream is not None, "Source must be streaming"

    def dynamic_thresholding(self, energy, buffer, seconds_per_buffer):
        """Decay energy_threshold toward energy * dynamic_energy_ratio.

        The damping factor shrinks for longer buffers so the adaptation speed
        is roughly independent of the chunk size.
        """
        damping = self.dynamic_energy_adjustment_damping ** seconds_per_buffer
        target_energy = energy * self.dynamic_energy_ratio
        self.energy_threshold = self.energy_threshold * damping + target_energy * (1 - damping)

    def adjust_ambient_noise(self, source, duration=1):
        """Sample *duration* seconds of background audio to calibrate energy_threshold."""
        self.check_source(source)

        seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
        elapsed_time = 0

        print(f"Adjust ambient noise {duration}")

        while True:
            elapsed_time += seconds_per_buffer
            if elapsed_time > duration:
                break
            buffer = source.stream.read(source.CHUNK)
            energy = audioop.rms(buffer, source.SAMPLE_WIDTH)

            self.dynamic_thresholding(energy, buffer, seconds_per_buffer)

    def listen(self, source, listen_timeout=None, phrase_timeout=None, is_listening_cb=None):
        """Block until one phrase is captured.

        Returns (frame_data, sample_rate, sample_width). Raises WaitTimeoutError
        if no speech starts within *listen_timeout* seconds. *phrase_timeout*
        caps the length of a single phrase. *is_listening_cb*, if given, is
        called with True when speech starts and False when capture ends.
        """
        self.check_source(source)

        seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
        elapsed_time = 0

        # Convert the second-based thresholds into buffer counts.
        pause_buffer_cnt = int(math.ceil(self.pause_threshold / seconds_per_buffer))
        phrase_buffer_cnt = int(math.ceil(self.phrase_threshold / seconds_per_buffer))
        non_speaking_buffer_cnt = int(math.ceil(self.non_speaking_duration / seconds_per_buffer))

        buffer = b""

        pause_cnt = 0
        phrase_cnt = 0
        timed_out = False

        while True:
            frames = collections.deque()

            # Phase 1: wait for speech, keeping a rolling pre-roll of quiet audio.
            while True:
                elapsed_time += seconds_per_buffer
                if listen_timeout and elapsed_time > listen_timeout:
                    raise WaitTimeoutError("Listener timed out while waiting for input")

                buffer = source.stream.read(source.CHUNK)
                if len(buffer) == 0:
                    break  # stream ended

                frames.append(buffer)
                if len(frames) > non_speaking_buffer_cnt:
                    frames.popleft()  # keep only the most recent pre-speech buffers

                energy = audioop.rms(buffer, source.SAMPLE_WIDTH)
                if energy > self.energy_threshold:
                    break  # speech detected

                if self.dynamic_energy_threshold:
                    print("dynamic_thresholding")
                    self.dynamic_thresholding(energy, buffer, seconds_per_buffer)

            if timed_out:
                break

            phrase_start_time = elapsed_time

            if is_listening_cb:
                is_listening_cb(True)

            # Phase 2: record until a long enough pause (or phrase_timeout / EOF).
            while True:
                elapsed_time += seconds_per_buffer

                if phrase_timeout and elapsed_time - phrase_start_time > phrase_timeout:
                    break

                buffer = source.stream.read(source.CHUNK)
                if len(buffer) == 0:
                    break

                frames.append(buffer)

                energy = audioop.rms(buffer, source.SAMPLE_WIDTH)
                if energy > self.energy_threshold:
                    pause_cnt = 0
                    phrase_cnt += 1
                else:
                    pause_cnt += 1

                if pause_cnt > pause_buffer_cnt:
                    break

            # Too-short phrases (clicks/pops) are discarded; go back to waiting.
            if phrase_cnt >= phrase_buffer_cnt or len(buffer) == 0:
                break

        if is_listening_cb:
            is_listening_cb(False)

        if frames:
            # Trim trailing silence down to non_speaking_duration.
            for _ in range(pause_cnt - non_speaking_buffer_cnt):
                frames.pop()

        frame_data = b"".join(frames)

        return (frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)

    def listen_in_background(self, source, listen_cb, listen_timeout=None, is_listening_cb=None):
        """Run listen() on a daemon thread, invoking listen_cb(self, data) per phrase.

        Returns a stopper(wait_join_stop=True) callable that stops the thread
        (and joins it unless wait_join_stop is falsy).
        """
        assert isinstance(source, AudioSource), "Source must be an AudioSource"
        running = [True]  # single-element list so the closures share mutable state

        def listen_in_thread():
            with source:
                while running[0]:
                    try:
                        data = self.listen(source=source, listen_timeout=1, phrase_timeout=listen_timeout, is_listening_cb=is_listening_cb)
                    except WaitTimeoutError:
                        # was: unconditional is_listening_cb(False), which raised
                        # TypeError whenever no callback was supplied
                        if is_listening_cb:
                            is_listening_cb(False)
                    else:
                        if running[0]:
                            listen_cb(self, data)

        def stopper(wait_join_stop=True):
            running[0] = False
            if wait_join_stop:
                listener_thread.join()

        listener_thread = threading.Thread(target=listen_in_thread)
        # was: listener_thread.deamon = True — a typo that set an unused
        # attribute, leaving the thread non-daemonic and able to keep the
        # process alive after main() returns
        listener_thread.daemon = True
        listener_thread.start()
        return stopper
1
opaudio
Submodule
1
opaudio
Submodule
@ -0,0 +1 @@
|
|||||||
|
Subproject commit 2e32c5e6bba76e75ed947ebf5bdf2467f9683a4b
|
||||||
3
requirements.txt
Normal file
3
requirements.txt
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
numpy
|
||||||
|
openai-whisper
|
||||||
|
PyAudio
|
||||||
142
transcribe.py
Normal file
142
transcribe.py
Normal file
@ -0,0 +1,142 @@
|
|||||||
|
import argparse
|
||||||
|
import numpy
|
||||||
|
import whisper
|
||||||
|
import torch
|
||||||
|
import wave
|
||||||
|
import os
|
||||||
|
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from time import sleep
|
||||||
|
from queue import Queue
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
import audio
|
||||||
|
|
||||||
|
class State(Enum):
    """UI state of the transcriber, shared between the main loop and callbacks."""

    IDLE = 1          # nothing happening; transcripts printed plainly
    TRANSCRIBING = 2  # whisper is running on queued audio
    LISTENING = 3     # microphone currently capturing speech


# Module-level state mutated by the listening callbacks.
state = State.IDLE
|
def main():
    """Parse CLI args, listen in the background, and live-transcribe with Whisper."""
    # was missing: without this declaration every `state = ...` below created a
    # function-local variable, so the module-level `state` read by
    # is_listening_callback never changed from IDLE
    global state

    from datetime import timezone  # file-level imports only bring datetime/timedelta

    p = argparse.ArgumentParser(description="TRANSCRIPTUM")
    p.add_argument("--model", default="medium", help="Whisper model", choices=["tiny", "base", "small", "medium", "large"])
    p.add_argument("--rms", default=1000, help="RMS (energy) threshold for microphone to detect", type=int)
    p.add_argument("--record_timeout", default=8, help="Timeout for the microphone recording", type=float)
    p.add_argument("--phrase_timeout", default=2, help="Silence timeout between phrases", type=float)
    p.add_argument("--dynamic_threshold", action="store_true", help="Use dynamic rms threshold?")

    args = p.parse_args()

    record_timeout = args.record_timeout
    phrase_timeout = args.phrase_timeout
    phrase_time = None  # timestamp of the last audio chunk, for phrase splitting

    data_queue = Queue()      # raw PCM chunks handed over by the listener thread
    transcripts = ['']        # rolling list of transcribed phrases

    whisper_model = whisper.load_model(args.model)
    print("Model loaded.\n")

    source = audio.Microphone.select()
    microphone = audio.Microphone(device_info=source, sample_rate=22050)

    listener = audio.Listener()
    listener.energy_threshold = args.rms
    listener.dynamic_energy_threshold = args.dynamic_threshold

    def print_transcripts(bcolor=None):
        # Redraw the whole transcript, optionally wrapped in an ANSI style code.
        os.system("clear")
        for l in transcripts:
            if bcolor:
                print(bcolor + l + '\033[0m')
            else:
                print(l)
        print('', end='', flush=True)

    def listen_callback(_, buffer: tuple) -> None:
        # buffer is (frame_data, SAMPLE_RATE, SAMPLE_WIDTH); queue the raw bytes.
        data_queue.put(buffer[0])

    def is_listening_callback(is_listening):
        global state
        if is_listening and state != State.LISTENING:
            print_transcripts('\033[1m')  # bold while capturing speech
            state = State.LISTENING
        elif state != State.IDLE and state != State.TRANSCRIBING:
            print_transcripts()
            state = State.IDLE

    stop = listener.listen_in_background(source=microphone, listen_cb=listen_callback, listen_timeout=record_timeout, is_listening_cb=is_listening_callback)

    os.system("clear")

    while True:
        try:
            # datetime.utcnow() is deprecated; use an aware UTC timestamp.
            # phrase_time is set from the same clock, so arithmetic stays consistent.
            now = datetime.now(timezone.utc)

            if not data_queue.empty():
                # A long-enough silence gap starts a new transcript entry;
                # otherwise the current entry is re-transcribed in place.
                phrase_complete = False
                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
                    phrase_complete = True

                phrase_time = now
                audio_data = b"".join(data_queue.queue)
                data_queue.queue.clear()

                # int16 PCM -> float32 in [-1, 1) as whisper expects.
                np_data = numpy.frombuffer(audio_data, dtype=numpy.int16).astype(numpy.float32) / 32768.0

                # was: State.LISTENING — the TRANSCRIBING member exists exactly so
                # is_listening_callback(False) does not clobber this highlight
                # while whisper is running
                state = State.TRANSCRIBING
                print_transcripts('\033[93m')  # warning color while transcribing
                r = whisper_model.transcribe(np_data, fp16=torch.cuda.is_available())

                t = r['text'].strip()

                if len(t) > 0:
                    if phrase_complete:
                        transcripts.append(t)
                    else:
                        transcripts[-1] = t

                print_transcripts()

                state = State.IDLE

            sleep(0.25)

        except KeyboardInterrupt:
            break

    stop(True)

    print("\nTranscripts:\n")
    for l in transcripts:
        print(l)


if __name__ == "__main__":
    main()
31
transcriptum.sh
Executable file
31
transcriptum.sh
Executable file
@ -0,0 +1,31 @@
|
|||||||
|
#!/bin/bash

# transcriptum.sh — manage the TRANSCRIPTUM virtualenv and run the transcriber.
# usage: ./transcriptum.sh [install|clean|run] [extra args passed to transcribe.py]

# Anchor the pattern at both ends: the original "^(install|clean|run)" also
# accepted prefixed garbage such as "installxyz" or "runner".
if ! [[ "$1" =~ ^(install|clean|run)$ ]]; then
    echo "usage: $0 [action]"
    echo "where action can be: [install, clean, run]"
    exit 1
fi


case $1 in

    install)
        # was: "intalling" (typo in the user-facing message)
        echo "installing virtual environment"
        python -m venv venv
        source venv/bin/activate
        pip install --upgrade pip
        pip install -r requirements.txt
        ;;

    clean)
        echo "cleaning up"
        rm -rf venv
        rm -rf __pycache__
        ;;

    run)
        echo "running"
        source venv/bin/activate
        # exec replaces the shell so signals reach python directly
        exec python transcribe.py "${@:2}"
        ;;

esac
Loading…
x
Reference in New Issue
Block a user