gauthiier 2024-04-24 09:42:52 +02:00
parent 2095207a1d
commit 6754406c0c
6 changed files with 469 additions and 0 deletions

.gitmodules vendored Normal file (3 lines added)

@@ -0,0 +1,3 @@
[submodule "opaudio"]
	path = opaudio
	url = https://grrrit.le-club-des-sans-sujets.org/Le-Club-des-Sans-Sujets/opaudio.git
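
Since opaudio comes in as a git submodule, a plain clone leaves its directory empty. The standard git commands apply (the parent repository URL below is a placeholder, not taken from this commit):

    git clone --recurse-submodules <repository-url>
    # or, inside an existing working copy:
    git submodule update --init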

audio.py Normal file (289 lines added)

@@ -0,0 +1,289 @@
import math
import pyaudio as pa
from opaudio import audioop
import threading
import collections

class AudioSource(object):
    def __init__(self):
        raise NotImplementedError("this is an abstract class")

    def __enter__(self):
        raise NotImplementedError("this is an abstract class")

    def __exit__(self, exc_type, exc_value, traceback):
        raise NotImplementedError("this is an abstract class")

class Microphone(AudioSource):
    def __init__(self, device_info=None, sample_rate=None, chunk_size=1024):
        assert device_info is not None, "device_info must not be None (see Microphone.select)"
        a = pa.PyAudio()
        try:
            if sample_rate is None:
                assert isinstance(device_info.get('defaultSampleRate'), (float, int)) and device_info.get('defaultSampleRate') > 0, "Wrong sample rate provided by PyAudio"
                sample_rate = int(device_info.get('defaultSampleRate'))
            self.device_info = device_info
        finally:
            a.terminate()
        self.format = pa.paInt16
        self.SAMPLE_WIDTH = pa.get_sample_size(self.format)
        self.SAMPLE_RATE = sample_rate
        self.CHUNK = chunk_size
        self.audio = None
        self.stream = None
    @staticmethod
    def select():
        # list input-capable devices and prompt for a selection
        n = 0
        microphones = []
        a = pa.PyAudio()
        for i in range(a.get_device_count()):
            d = a.get_device_info_by_index(i)
            if d.get('maxInputChannels') > 0:
                microphones.append(d)
                print(f"{n}. {d.get('name')}")
                n += 1
        while True:
            sel = input("select microphone: ")
            if not sel.isdigit() or int(sel) >= n:
                print("Wrong selection.")
                continue
            m = microphones[int(sel)]
            a.terminate()
            return m
    def __enter__(self):
        assert self.stream is None, "Source already streaming"
        self.a = pa.PyAudio()
        try:
            pa_stream = self.a.open(input_device_index=self.device_info.get('index'), channels=1, format=self.format, rate=self.SAMPLE_RATE, frames_per_buffer=self.CHUNK, input=True)
            self.stream = Microphone.Stream(pa_stream)
        except Exception as e:
            print(e)
            self.a.terminate()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        try:
            self.stream.close()
        finally:
            self.stream = None
            self.a.terminate()

    class Stream(object):
        def __init__(self, pa_stream):
            self.pa_stream = pa_stream

        def read(self, size):
            return self.pa_stream.read(size, exception_on_overflow=False)

        def close(self):
            try:
                if not self.pa_stream.is_stopped():
                    self.pa_stream.stop_stream()
            finally:
                self.pa_stream.close()


class WaitTimeoutError(Exception):
    pass

class Listener(AudioSource):
    def __init__(self):
        self.energy_threshold = 300  # minimum audio energy to consider for recording
        self.dynamic_energy_threshold = True
        self.dynamic_energy_adjustment_damping = 0.15
        self.dynamic_energy_ratio = 1.5
        self.pause_threshold = 0.8  # seconds of non-speaking audio before a phrase is considered complete
        self.operation_timeout = None  # seconds after an internal operation (e.g., an API request) starts before it times out, or None for no timeout
        self.phrase_threshold = 0.5  # minimum seconds of speaking audio before we consider the speaking audio a phrase - values below this are ignored (for filtering out clicks and pops)
        self.non_speaking_duration = 0.5  # seconds of non-speaking audio to keep on both sides of the recording

    def check_source(self, source):
        assert isinstance(source, AudioSource), "Source must be an AudioSource"
        assert source.stream is not None, "Source must be streaming"

    def dynamic_thresholding(self, energy, buffer, seconds_per_buffer):
        # exponentially drift the threshold toward energy * dynamic_energy_ratio;
        # the damping factor accounts for how much time each buffer represents
        damping = self.dynamic_energy_adjustment_damping ** seconds_per_buffer
        target_energy = energy * self.dynamic_energy_ratio
        self.energy_threshold = self.energy_threshold * damping + target_energy * (1 - damping)

    def adjust_ambient_noise(self, source, duration=1):
        self.check_source(source)
        seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
        elapsed_time = 0
        print(f"Adjust ambient noise {duration}")
        while True:
            elapsed_time += seconds_per_buffer
            if elapsed_time > duration:
                break
            buffer = source.stream.read(source.CHUNK)
            energy = audioop.rms(buffer, source.SAMPLE_WIDTH)
            self.dynamic_thresholding(energy, buffer, seconds_per_buffer)
    def listen(self, source, listen_timeout=None, phrase_timeout=None, is_listening_cb=None):
        self.check_source(source)
        seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
        elapsed_time = 0
        pause_buffer_cnt = int(math.ceil(self.pause_threshold / seconds_per_buffer))
        phrase_buffer_cnt = int(math.ceil(self.phrase_threshold / seconds_per_buffer))
        non_speaking_buffer_cnt = int(math.ceil(self.non_speaking_duration / seconds_per_buffer))
        buffer = b""
        pause_cnt = 0
        phrase_cnt = 0
        while True:
            frames = collections.deque()
            # wait for a phrase to start: keep a rolling window of the last
            # non_speaking_buffer_cnt chunks until the energy crosses the threshold
            while True:
                elapsed_time += seconds_per_buffer
                if listen_timeout and elapsed_time > listen_timeout:
                    raise WaitTimeoutError("Listener timed out while waiting for input")
                buffer = source.stream.read(source.CHUNK)
                if len(buffer) == 0:
                    break
                frames.append(buffer)
                if len(frames) > non_speaking_buffer_cnt:
                    frames.popleft()
                energy = audioop.rms(buffer, source.SAMPLE_WIDTH)
                if energy > self.energy_threshold:
                    break
                if self.dynamic_energy_threshold:
                    self.dynamic_thresholding(energy, buffer, seconds_per_buffer)
            phrase_start_time = elapsed_time
            if is_listening_cb:
                is_listening_cb(True)
            # record the phrase: stop on a pause longer than pause_threshold,
            # on phrase_timeout, or when the stream runs dry
            while True:
                elapsed_time += seconds_per_buffer
                if phrase_timeout and elapsed_time - phrase_start_time > phrase_timeout:
                    break
                buffer = source.stream.read(source.CHUNK)
                if len(buffer) == 0:
                    break
                frames.append(buffer)
                energy = audioop.rms(buffer, source.SAMPLE_WIDTH)
                if energy > self.energy_threshold:
                    pause_cnt = 0
                    phrase_cnt += 1
                else:
                    pause_cnt += 1
                if pause_cnt > pause_buffer_cnt:
                    break
            # accept the phrase only if it contained enough speaking audio; otherwise start over
            if phrase_cnt >= phrase_buffer_cnt or len(buffer) == 0:
                break
        if is_listening_cb:
            is_listening_cb(False)
        if frames:
            # trim trailing silence down to non_speaking_duration seconds
            for i in range(pause_cnt - non_speaking_buffer_cnt):
                frames.pop()
        frame_data = b"".join(frames)
        return (frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
    def listen_in_background(self, source, listen_cb, listen_timeout=None, is_listening_cb=None):
        assert isinstance(source, AudioSource), "Source must be an AudioSource"
        running = [True]

        def listen_in_thread():
            with source:
                while running[0]:
                    try:
                        data = self.listen(source=source, listen_timeout=1, phrase_timeout=listen_timeout, is_listening_cb=is_listening_cb)
                    except WaitTimeoutError:
                        if is_listening_cb:
                            is_listening_cb(False)
                    else:
                        if running[0]:
                            listen_cb(self, data)

        def stopper(wait_join_stop=True):
            running[0] = False
            if wait_join_stop:
                listener_thread.join()

        listener_thread = threading.Thread(target=listen_in_thread)
        listener_thread.daemon = True
        listener_thread.start()
        return stopper
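
For orientation, here is a minimal sketch of how these pieces fit together, calling Listener.listen directly rather than through the background thread. This is illustrative only: device selection is interactive, and listen raises WaitTimeoutError if nothing crosses the energy threshold within listen_timeout.

    import audio

    info = audio.Microphone.select()                      # interactive device picker
    mic = audio.Microphone(device_info=info, sample_rate=22050)
    listener = audio.Listener()

    with mic:                                             # opens the PyAudio input stream
        frames, rate, width = listener.listen(mic, listen_timeout=10)
    print(f"captured {len(frames)} bytes ({rate} Hz, {width} bytes/sample)")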

opaudio Submodule (1 line added)

@@ -0,0 +1 @@
Subproject commit 2e32c5e6bba76e75ed947ebf5bdf2467f9683a4b
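
Worth noting: audio.py imports audioop from this submodule (from opaudio import audioop) rather than from the standard library; audioop was deprecated in CPython 3.11 and removed in 3.13, so a vendored copy presumably keeps audioop.rms() available on newer interpreters. A minimal smoke test, assuming the submodule is checked out:

    from opaudio import audioop

    silence = b"\x00\x00" * 1024          # 1024 silent 16-bit samples
    assert audioop.rms(silence, 2) == 0   # RMS energy of pure silence is zero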

requirements.txt Normal file (3 lines added)

@@ -0,0 +1,3 @@
numpy
openai-whisper
PyAudio
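
One caveat worth flagging: PyAudio binds to the PortAudio C library, so installing it with pip usually requires the PortAudio development headers on the system (on Debian/Ubuntu that package is portaudio19-dev; other platforms differ). torch, which transcribe.py imports directly, is pulled in transitively by openai-whisper.

    sudo apt install portaudio19-dev   # assumption: a Debian/Ubuntu host
    pip install -r requirements.txt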

transcribe.py Normal file (142 lines added)

@@ -0,0 +1,142 @@
import argparse
import numpy
import whisper
import torch
import os
from datetime import datetime, timedelta
from time import sleep
from queue import Queue
from enum import Enum

import audio


class State(Enum):
    IDLE = 1
    TRANSCRIBING = 2
    LISTENING = 3


state = State.IDLE

def main():
    p = argparse.ArgumentParser(description="TRANSCRIPTUM")
    p.add_argument("--model", default="medium", help="Whisper model", choices=["tiny", "base", "small", "medium", "large"])
    p.add_argument("--rms", default=1000, help="RMS (energy) threshold above which the microphone detects speech", type=int)
    p.add_argument("--record_timeout", default=8, help="Timeout for the microphone recording", type=float)
    p.add_argument("--phrase_timeout", default=2, help="Silence timeout between phrases", type=float)
    p.add_argument("--dynamic_threshold", action="store_true", help="Use a dynamic RMS threshold")
    args = p.parse_args()

    record_timeout = args.record_timeout
    phrase_timeout = args.phrase_timeout
    phrase_time = None
    data_queue = Queue()
    transcripts = ['']

    whisper_model = whisper.load_model(args.model)
    print("Model loaded.\n")

    source = audio.Microphone.select()
    microphone = audio.Microphone(device_info=source, sample_rate=22050)
    listener = audio.Listener()
    listener.energy_threshold = args.rms
    listener.dynamic_energy_threshold = args.dynamic_threshold

    # optionally calibrate the energy threshold against ambient noise before listening:
    # with microphone:
    #     listener.adjust_ambient_noise(microphone, duration=1)
    def print_transcripts(bcolor=None):
        os.system("clear")
        for l in transcripts:
            if bcolor:
                print(bcolor + l + '\033[0m')
            else:
                print(l)
        print('', end='', flush=True)

    # listen_cb receives (frame_data, SAMPLE_RATE, SAMPLE_WIDTH); only the raw frames are queued
    def listen_callback(_, buffer: tuple) -> None:
        data_queue.put(buffer[0])

    def is_listening_callback(is_listening):
        global state
        if is_listening and state != State.LISTENING:
            print_transcripts('\033[1m')  # bold: a phrase is being recorded
            state = State.LISTENING
        elif state != State.IDLE and state != State.TRANSCRIBING:
            print_transcripts()
            state = State.IDLE
    stop = listener.listen_in_background(source=microphone, listen_cb=listen_callback, listen_timeout=record_timeout, is_listening_cb=is_listening_callback)
    os.system("clear")

    while True:
        try:
            now = datetime.utcnow()
            if not data_queue.empty():
                phrase_complete = False
                # if enough silence has passed since the last chunk, start a new transcript line
                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
                    phrase_complete = True
                phrase_time = now
                audio_data = b"".join(data_queue.queue)
                data_queue.queue.clear()
                # 16-bit PCM bytes -> float32 samples in [-1.0, 1.0], the format Whisper expects
                np_data = numpy.frombuffer(audio_data, dtype=numpy.int16).astype(numpy.float32) / 32768.0
                state = State.TRANSCRIBING
                print_transcripts('\033[93m')  # yellow: transcription in progress
                r = whisper_model.transcribe(np_data, fp16=torch.cuda.is_available())
                t = r['text'].strip()
                if len(t) > 0:
                    if phrase_complete:
                        transcripts.append(t)
                    else:
                        transcripts[-1] = t
                print_transcripts()
                state = State.IDLE
            sleep(0.25)
        except KeyboardInterrupt:
            break

    stop(True)
    print("\nTranscripts:\n")
    for l in transcripts:
        print(l)


if __name__ == "__main__":
    main()
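
For reference, a sample invocation using the flags defined above (the values are arbitrary examples, not tuned recommendations):

    python transcribe.py --model small --rms 800 --phrase_timeout 3 --dynamic_threshold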

transcriptum.sh Executable file (31 lines added)

@@ -0,0 +1,31 @@
#!/bin/bash

if ! [[ "$1" =~ ^(install|clean|run)$ ]]; then
    echo "usage: $0 [action]"
    echo "where action can be: [install, clean, run]"
    exit 1
fi

case $1 in
    install)
        echo "installing virtual environment"
        python -m venv venv
        source venv/bin/activate
        pip install --upgrade pip
        pip install -r requirements.txt
        ;;
    clean)
        echo "cleaning up"
        rm -rf venv
        rm -rf __pycache__
        ;;
    run)
        echo "running"
        source venv/bin/activate
        exec python transcribe.py "${@:2}"
        ;;
esac
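
A typical session, given the three actions the script defines; anything after run is forwarded to transcribe.py via "${@:2}":

    ./transcriptum.sh install
    ./transcriptum.sh run --model small --dynamic_threshold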