diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..b511a66
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "opaudio"]
+	path = opaudio
+	url = https://grrrit.le-club-des-sans-sujets.org/Le-Club-des-Sans-Sujets/opaudio.git
diff --git a/audio.py b/audio.py
new file mode 100644
index 0000000..293a09e
--- /dev/null
+++ b/audio.py
@@ -0,0 +1,289 @@
+import math
+import pyaudio as pa
+from opaudio import audioop
+import threading
+import collections
+
+
+class AudioSource(object):
+    def __init__(self):
+        raise NotImplementedError("this is an abstract class")
+
+    def __enter__(self):
+        raise NotImplementedError("this is an abstract class")
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        raise NotImplementedError("this is an abstract class")
+
+
+class Microphone(AudioSource):
+
+    def __init__(self, device_info=None, sample_rate=None, chunk_size=1024):
+        assert device_info is not None, "device_info must not be None (see Microphone.select)"
+        a = pa.PyAudio()
+        try:
+            if sample_rate is None:
+                assert isinstance(device_info.get('defaultSampleRate'), (float, int)) and device_info.get('defaultSampleRate') > 0, "Wrong sample rate provided by PyAudio"
+                sample_rate = int(device_info.get('defaultSampleRate'))
+            self.device_info = device_info
+        finally:
+            a.terminate()
+
+        self.format = pa.paInt16
+        self.SAMPLE_WIDTH = pa.get_sample_size(self.format)
+        self.SAMPLE_RATE = sample_rate
+        self.CHUNK = chunk_size
+
+        self.audio = None
+        self.stream = None
+
+    @staticmethod
+    def select():
+        n = 0
+        microphones = []
+        a = pa.PyAudio()
+        for i in range(a.get_device_count()):
+            d = a.get_device_info_by_index(i)
+            if d.get('maxInputChannels') > 0:
+                microphones.append(d)
+                print(f"{n}. {d.get('name')}")
+                n += 1
+
+        while True:
+            sel = input("select microphone: ")
+            # `n` equals len(microphones) here, so valid indices are 0..n-1.
+            if not sel.isdigit() or int(sel) >= n:
+                print("Wrong selection.")
+                continue
+            m = microphones[int(sel)]
+            a.terminate()
+            return m
+
+    def __enter__(self):
+        assert self.stream is None, "Source already streaming"
+        self.a = pa.PyAudio()
+        try:
+            pa_stream = self.a.open(input_device_index=self.device_info.get('index'), channels=1, format=self.format, rate=self.SAMPLE_RATE, frames_per_buffer=self.CHUNK, input=True)
+            self.stream = Microphone.Stream(pa_stream)
+        except Exception:
+            # Don't leak the PyAudio instance or return a half-open source:
+            # __exit__ would otherwise fail on a None stream.
+            self.a.terminate()
+            raise
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        try:
+            self.stream.close()
+        finally:
+            self.stream = None
+            self.a.terminate()
+
+    class Stream(object):
+
+        def __init__(self, pa_stream):
+            self.pa_stream = pa_stream
+
+        def read(self, size):
+            return self.pa_stream.read(size, exception_on_overflow=False)
+
+        def close(self):
+            try:
+                if not self.pa_stream.is_stopped():
+                    self.pa_stream.stop_stream()
+            finally:
+                self.pa_stream.close()
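+
+# A minimal capture sketch (illustrative only, not part of the module API;
+# assumes a device picked interactively via Microphone.select):
+#
+#   mic = Microphone(device_info=Microphone.select())
+#   with mic:
+#       chunk = mic.stream.read(mic.CHUNK)  # raw mono int16 frames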
+
+
+class WaitTimeoutError(Exception):
+    pass
+
+
+class Listener(AudioSource):
+
+    def __init__(self):
+        self.energy_threshold = 300  # minimum audio energy to consider for recording
+        self.dynamic_energy_threshold = True
+        self.dynamic_energy_adjustment_damping = 0.15
+        self.dynamic_energy_ratio = 1.5
+        self.pause_threshold = 0.8  # seconds of non-speaking audio before a phrase is considered complete
+        self.operation_timeout = None  # seconds after an internal operation (e.g., an API request) starts before it times out, or ``None`` for no timeout
+        self.phrase_threshold = 0.5  # minimum seconds of speaking audio before we consider the speaking audio a phrase - values below this are ignored (for filtering out clicks and pops)
+        self.non_speaking_duration = 0.5  # seconds of non-speaking audio to keep on both sides of the recording
+
+    def check_source(self, source):
+        assert isinstance(source, AudioSource), "Source must be an AudioSource"
+        assert source.stream is not None, "Source must be streaming"
+
+    def dynamic_thresholding(self, energy, buffer, seconds_per_buffer):
+        damping = self.dynamic_energy_adjustment_damping ** seconds_per_buffer
+        target_energy = energy * self.dynamic_energy_ratio
+        self.energy_threshold = self.energy_threshold * damping + target_energy * (1 - damping)
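+        # The update above is an exponential moving average toward
+        # energy * dynamic_energy_ratio. Since damping = 0.15 ** seconds_per_buffer,
+        # longer buffers yield a smaller damping factor and therefore faster
+        # adaptation; e.g. with CHUNK=1024 at 22050 Hz, seconds_per_buffer is
+        # roughly 0.046 and damping roughly 0.92 per buffer.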
+
+    def adjust_ambient_noise(self, source, duration=1):
+        self.check_source(source)
+
+        seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
+        elapsed_time = 0
+
+        print(f"Adjust ambient noise {duration}")
+
+        while True:
+            elapsed_time += seconds_per_buffer
+            if elapsed_time > duration:
+                break
+            buffer = source.stream.read(source.CHUNK)
+            energy = audioop.rms(buffer, source.SAMPLE_WIDTH)
+
+            self.dynamic_thresholding(energy, buffer, seconds_per_buffer)
+
+    def listen(self, source, listen_timeout=None, phrase_timeout=None, is_listening_cb=None):
+        self.check_source(source)
+
+        seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
+        elapsed_time = 0
+
+        pause_buffer_cnt = int(math.ceil(self.pause_threshold / seconds_per_buffer))
+        phrase_buffer_cnt = int(math.ceil(self.phrase_threshold / seconds_per_buffer))
+        non_speaking_buffer_cnt = int(math.ceil(self.non_speaking_duration / seconds_per_buffer))
+
+        buffer = b""
+
+        pause_cnt = 0
+        phrase_cnt = 0
+        timed_out = False
+
+        while True:
+            frames = collections.deque()
+
+            # Wait for speech, keeping a rolling window of the last few buffers
+            # so the start of the phrase is not clipped.
+            while True:
+                elapsed_time += seconds_per_buffer
+                if listen_timeout and elapsed_time > listen_timeout:
+                    raise WaitTimeoutError("Listener timed out while waiting for input")
+                    # if phrase_cnt > 0:
+                    #     timed_out = True
+                    #     break
+                    # else:
+                    #     raise WaitTimeoutError("Listener timed out while waiting for input")
+
+                buffer = source.stream.read(source.CHUNK)
+                if len(buffer) == 0:
+                    break
+
+                frames.append(buffer)
+
+                if len(frames) > non_speaking_buffer_cnt:
+                    frames.popleft()
+
+                energy = audioop.rms(buffer, source.SAMPLE_WIDTH)
+                # print(f"{energy} - {self.energy_threshold}")
+
+                if energy > self.energy_threshold:
+                    break
+
+                if self.dynamic_energy_threshold:
+                    self.dynamic_thresholding(energy, buffer, seconds_per_buffer)
+
+            if timed_out:
+                break
+
+            phrase_start_time = elapsed_time
+
+            if is_listening_cb:
+                is_listening_cb(True)
+
+            # Record the phrase until enough consecutive silent buffers pass.
+            while True:
+                elapsed_time += seconds_per_buffer
+
+                if phrase_timeout and elapsed_time - phrase_start_time > phrase_timeout:
+                    break
+
+                buffer = source.stream.read(source.CHUNK)
+                if len(buffer) == 0:
+                    break
+
+                frames.append(buffer)
+
+                energy = audioop.rms(buffer, source.SAMPLE_WIDTH)
+                if energy > self.energy_threshold:
+                    pause_cnt = 0
+                    phrase_cnt += 1
+                else:
+                    pause_cnt += 1
+
+                if pause_cnt > pause_buffer_cnt:
+                    break
+
+            # A phrase long enough (or end of stream) ends the listen;
+            # shorter bursts are discarded as clicks and pops.
+            if phrase_cnt >= phrase_buffer_cnt or len(buffer) == 0:
+                break
+
+        if is_listening_cb:
+            is_listening_cb(False)
+
+        if frames:
+            # Drop trailing silence, keeping non_speaking_duration of padding.
+            for _ in range(pause_cnt - non_speaking_buffer_cnt):
+                frames.pop()
+
+        frame_data = b"".join(frames)
+
+        return (frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
+
+    def listen_in_background(self, source, listen_cb, listen_timeout=None, is_listening_cb=None):
+        assert isinstance(source, AudioSource), "Source must be an AudioSource"
+        running = [True]
+
+        def listen_in_thread():
+            with source:
+                while running[0]:
+                    try:
+                        # Poll in 1 s slices so the thread notices stop requests promptly.
+                        data = self.listen(source=source, listen_timeout=1, phrase_timeout=listen_timeout, is_listening_cb=is_listening_cb)
+                    except WaitTimeoutError:
+                        if is_listening_cb:
+                            is_listening_cb(False)
+                    else:
+                        if running[0]:
+                            listen_cb(self, data)
+
+        def stopper(wait_join_stop=True):
+            running[0] = False
+            if wait_join_stop:
+                listener_thread.join()
+
+        listener_thread = threading.Thread(target=listen_in_thread)
+        listener_thread.daemon = True
+        listener_thread.start()
+        return stopper
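+
+# A minimal background-listening sketch (illustrative only; `on_phrase` is a
+# hypothetical callback, not part of this module):
+#
+#   listener = Listener()
+#   mic = Microphone(device_info=Microphone.select())
+#   def on_phrase(listener, data):
+#       frame_data, sample_rate, sample_width = data
+#       ...  # hand the raw int16 frames to a recognizer
+#   stop = listener.listen_in_background(mic, on_phrase)
+#   ...
+#   stop()  # stops the loop and joins the worker thread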
diff --git a/opaudio b/opaudio
new file mode 160000
index 0000000..2e32c5e
--- /dev/null
+++ b/opaudio
@@ -0,0 +1 @@
+Subproject commit 2e32c5e6bba76e75ed947ebf5bdf2467f9683a4b
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..44cf638
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+numpy
+openai-whisper
+PyAudio
diff --git a/transcribe.py b/transcribe.py
new file mode 100644
index 0000000..de0700e
--- /dev/null
+++ b/transcribe.py
@@ -0,0 +1,142 @@
+import argparse
+import numpy
+import whisper
+import torch
+import os
+
+from datetime import datetime, timedelta
+from time import sleep
+from queue import Queue
+from enum import Enum
+
+import audio
+
+
+class State(Enum):
+    IDLE = 1
+    TRANSCRIBING = 2
+    LISTENING = 3
+
+state = State.IDLE
+
+
+def main():
+    global state
+
+    p = argparse.ArgumentParser(description="TRANSCRIPTUM")
+    p.add_argument("--model", default="medium", help="Whisper model", choices=["tiny", "base", "small", "medium", "large"])
+    p.add_argument("--rms", default=1000, help="RMS (energy) threshold for microphone to detect", type=int)
+    p.add_argument("--record_timeout", default=8, help="Timeout for the microphone recording", type=float)
+    p.add_argument("--phrase_timeout", default=2, help="Silence timeout between phrases", type=float)
+    p.add_argument("--dynamic_threshold", action="store_true", help="Use dynamic rms threshold?")
+
+    args = p.parse_args()
+
+    record_timeout = args.record_timeout
+    phrase_timeout = args.phrase_timeout
+    phrase_time = None
+
+    data_queue = Queue()
+    transcripts = ['']
+
+    whisper_model = whisper.load_model(args.model)
+    print("Model loaded.\n")
+
+    # select microphone
+    source = audio.Microphone.select()
+    microphone = audio.Microphone(device_info=source, sample_rate=22050)
+
+    listener = audio.Listener()
+    listener.energy_threshold = args.rms
+    listener.dynamic_energy_threshold = args.dynamic_threshold
+
+    # with microphone:
+    #     listener.adjust_ambient_noise(microphone, duration=1)
+
+    def print_transcripts(bcolor=None):
+        os.system("clear")
+        for l in transcripts:
+            if bcolor:
+                print(bcolor + l + '\033[0m')
+            else:
+                print(l)
+        print('', end='', flush=True)
+
+    # buffer is the (frame_data, SAMPLE_RATE, SAMPLE_WIDTH) tuple returned by Listener.listen
+    def listen_callback(_, buffer: tuple) -> None:
+        data_queue.put(buffer[0])
+
+    def is_listening_callback(is_listening):
+        global state
+        if is_listening and state != State.LISTENING:
+            print_transcripts('\033[1m')  # bold
+            state = State.LISTENING
+        elif state != State.IDLE and state != State.TRANSCRIBING:
+            print_transcripts()
+            state = State.IDLE
+
+    stop = listener.listen_in_background(source=microphone, listen_cb=listen_callback, listen_timeout=record_timeout, is_listening_cb=is_listening_callback)
+
+    os.system("clear")
+
+    while True:
+        try:
+            now = datetime.utcnow()
+
+            if not data_queue.empty():
+
+                # A long enough silence since the last chunk starts a new line;
+                # otherwise the current line is re-transcribed and replaced.
+                phrase_complete = False
+                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
+                    phrase_complete = True
+
+                phrase_time = now
+                audio_data = b"".join(data_queue.queue)
+                data_queue.queue.clear()
+
+                # Whisper expects mono float32 PCM in [-1.0, 1.0], so scale the
+                # int16 samples by 1/32768.
+                np_data = numpy.frombuffer(audio_data, dtype=numpy.int16).astype(numpy.float32) / 32768.0
+
+                state = State.TRANSCRIBING
+                print_transcripts('\033[93m')  # warning color while transcribing
+                r = whisper_model.transcribe(np_data, fp16=torch.cuda.is_available())
+
+                t = r['text'].strip()
+
+                if len(t) > 0:
+                    if phrase_complete:
+                        transcripts.append(t)
+                    else:
+                        transcripts[-1] = t
+
+                print_transcripts()
+
+                state = State.IDLE
+
+            sleep(0.25)
+
+        except KeyboardInterrupt:
+            break
+
+    stop(True)
+
+    print("\nTranscripts:\n")
+    for l in transcripts:
+        print(l)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/transcriptum.sh b/transcriptum.sh
new file mode 100755
index 0000000..cd7a043
--- /dev/null
+++ b/transcriptum.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+if ! [[ "$1" =~ ^(install|clean|run)$ ]]; then
+    echo "usage: $0 [action]"
+    echo "where action can be: [install, clean, run]"
+    exit 1
+fi
+
+case $1 in
+
+    install)
+        echo "installing virtual environment"
+        python -m venv venv
+        source venv/bin/activate
+        pip install --upgrade pip
+        pip install -r requirements.txt
+        ;;
+    clean)
+        echo "cleaning up"
+        rm -rf venv
+        rm -rf __pycache__
+        ;;
+    run)
+        echo "running"
+        source venv/bin/activate
+        exec python transcribe.py "${@:2}"
+        ;;
+esac
\ No newline at end of file