import numpy
import math
import pyaudio as pa
from opaudio import audioop
import threading
import collections


class AudioSource(object):
    """Abstract base for audio sources that stream PCM data via a context manager."""

    def __init__(self):
        raise NotImplementedError("this is an abstract class")

    def __enter__(self):
        raise NotImplementedError("this is an abstract class")

    def __exit__(self, exc_type, exc_value, traceback):
        raise NotImplementedError("this is an abstract class")


class Microphone(AudioSource):
    """PyAudio-backed microphone producing 16-bit mono PCM chunks.

    Use ``Microphone.select()`` to obtain a ``device_info`` dict interactively,
    then stream with ``with Microphone(device_info) as source: ...``.
    """

    def __init__(self, device_info=None, sample_rate=None, chunk_size=1024):
        """Configure the source.

        device_info: PyAudio device-info dict (see Microphone.select); required.
        sample_rate: sampling rate in Hz; defaults to the device's reported rate.
        chunk_size:  frames per buffer read.
        """
        assert device_info is not None, "device_info must not be None (see Microphone.select)"
        a = pa.PyAudio()
        try:
            if sample_rate is None:
                # Fall back to the default rate PyAudio reports for this device.
                default_rate = device_info.get('defaultSampleRate')
                assert isinstance(default_rate, (float, int)) and default_rate > 0, \
                    "Wrong sample rate provided by PyAudio"
                sample_rate = int(default_rate)
            self.device_info = device_info
        finally:
            a.terminate()
        self.format = pa.paInt16
        self.SAMPLE_WIDTH = pa.get_sample_size(self.format)  # bytes per sample
        self.SAMPLE_RATE = sample_rate
        self.CHUNK = chunk_size
        self.audio = None
        self.stream = None

    @staticmethod
    def select():
        """Interactively choose an input device; return its PyAudio info dict."""
        n = 0
        microphones = []
        a = pa.PyAudio()
        # try/finally so the PyAudio instance is released even if enumeration
        # raises (the original leaked it on any error path).
        try:
            for i in range(a.get_device_count()):
                d = a.get_device_info_by_index(i)
                if d.get('maxInputChannels') > 0:
                    microphones.append(d)
                    print(f"{n}. {d.get('name')}")
                    n += 1
            while True:
                sel = input("select microphone: ")
                # BUG FIX: the original tested `int(sel) > n`, accepting sel == n
                # and raising IndexError; valid indices are 0..n-1.
                if not sel.isdigit() or int(sel) >= n:
                    print("Wrong selection.")
                    continue
                return microphones[int(sel)]
        finally:
            a.terminate()

    def __enter__(self):
        """Open the input stream; raises if the device cannot be opened."""
        assert self.stream is None, "Source already streaming"
        self.a = pa.PyAudio()
        try:
            pa_stream = self.a.open(
                input_device_index=self.device_info.get('index'),
                channels=1,
                format=self.format,
                rate=self.SAMPLE_RATE,
                frames_per_buffer=self.CHUNK,
                input=True,
            )
            self.stream = Microphone.Stream(pa_stream)
        except Exception as e:
            # BUG FIX: the original swallowed the error and returned a source
            # with stream=None, which then crashed in __exit__ (AttributeError
            # on stream.close()) and failed Listener.check_source. Clean up
            # and propagate instead.
            print(e)
            self.a.terminate()
            raise
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        try:
            # Guard against a half-initialized source (stream never opened).
            if self.stream is not None:
                self.stream.close()
        finally:
            self.stream = None
            self.a.terminate()

    class Stream(object):
        """Thin wrapper around a PyAudio stream with safe read/close semantics."""

        def __init__(self, pa_stream):
            self.pa_stream = pa_stream

        def read(self, size):
            # Never raise on input overflow; losing frames is acceptable here.
            return self.pa_stream.read(size, exception_on_overflow=False)

        def close(self):
            try:
                if not self.pa_stream.is_stopped():
                    self.pa_stream.stop_stream()
            finally:
                self.pa_stream.close()


class WaitTimeoutError(Exception):
    """Raised when listening times out before any speech is captured."""
    pass


class Listener(AudioSource):
    """Energy-threshold based speech detector over an AudioSource stream."""

    def __init__(self):
        self.energy_threshold = 300  # minimum audio energy to consider for recording
        self.dynamic_energy_threshold = True
        self.dynamic_energy_adjustment_damping = 0.15
        self.dynamic_energy_ratio = 1.5
        self.pause_threshold = 0.8  # seconds of non-speaking audio before a phrase is considered complete
        self.operation_timeout = None  # seconds after an internal operation (e.g., an API request) starts before it times out, or ``None`` for no timeout
        self.phrase_threshold = 0.5  # minimum seconds of speaking audio before we consider the speaking audio a phrase - values below this are ignored (for filtering out clicks and pops)
        self.non_speaking_duration = 0.5  # seconds of non-speaking audio to keep on both sides of the recording

    def check_source(self, source):
        """Validate that *source* is a streaming AudioSource."""
        assert isinstance(source, AudioSource), "Source must be an AudioSource"
        assert source.stream is not None, "Source must be streaming"

    def dynamic_thresholding(self, energy, buffer, seconds_per_buffer):
        """Exponentially adapt energy_threshold toward the ambient energy level.

        The damping factor is scaled by the buffer duration so adaptation speed
        is independent of chunk size.
        """
        damping = self.dynamic_energy_adjustment_damping ** seconds_per_buffer
        target_energy = energy * self.dynamic_energy_ratio
        self.energy_threshold = self.energy_threshold * damping + target_energy * (1 - damping)

    def adjust_ambient_noise(self, source, duration=1):
        """Sample *duration* seconds of audio to calibrate the energy threshold."""
        self.check_source(source)
        seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
        elapsed_time = 0
        print(f"Adjust ambient noise {duration}")
        while True:
            elapsed_time += seconds_per_buffer
            if elapsed_time > duration:
                break
            buffer = source.stream.read(source.CHUNK)
            energy = audioop.rms(buffer, source.SAMPLE_WIDTH)
            self.dynamic_thresholding(energy, buffer, seconds_per_buffer)

    def listen(self, source, listen_timeout=None, phrase_timeout=None, is_listening_cb=None):
        """Record a single phrase from *source*.

        Waits until audio energy exceeds the threshold, records until
        ``pause_threshold`` seconds of silence follow, and returns
        ``(frame_data, sample_rate, sample_width)``. Phrases shorter than
        ``phrase_threshold`` are discarded and listening resumes.

        listen_timeout:  max seconds to wait for speech to start; raises
                         WaitTimeoutError when exceeded.
        phrase_timeout:  max seconds a single phrase may last.
        is_listening_cb: optional callable invoked with True when a phrase
                         starts and False when listening ends.
        """
        self.check_source(source)
        seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
        elapsed_time = 0
        # Convert the second-based thresholds into buffer counts.
        pause_buffer_cnt = int(math.ceil(self.pause_threshold / seconds_per_buffer))
        phrase_buffer_cnt = int(math.ceil(self.phrase_threshold / seconds_per_buffer))
        non_speaking_buffer_cnt = int(math.ceil(self.non_speaking_duration / seconds_per_buffer))
        buffer = b""
        pause_cnt = 0
        phrase_cnt = 0
        while True:
            # BUG FIX: reset the counters for every attempt; the original let
            # phrase_cnt/pause_cnt accumulate across discarded attempts, so
            # isolated clicks could eventually add up to a "phrase".
            pause_cnt = 0
            phrase_cnt = 0
            frames = collections.deque()
            # Phase 1: wait for speech, keeping a rolling window of leading
            # non-speaking audio so the phrase start is not clipped.
            while True:
                elapsed_time += seconds_per_buffer
                if listen_timeout and elapsed_time > listen_timeout:
                    raise WaitTimeoutError("Listener timed out while waiting for input")
                buffer = source.stream.read(source.CHUNK)
                if len(buffer) == 0:
                    break
                frames.append(buffer)
                if len(frames) > non_speaking_buffer_cnt:
                    frames.popleft()
                energy = audioop.rms(buffer, source.SAMPLE_WIDTH)
                if energy > self.energy_threshold:
                    break
                if self.dynamic_energy_threshold:
                    print("dynamic_thresholding")
                    self.dynamic_thresholding(energy, buffer, seconds_per_buffer)
            phrase_start_time = elapsed_time
            if is_listening_cb:
                is_listening_cb(True)
            # Phase 2: record until enough consecutive silent buffers arrive,
            # the stream ends, or the phrase exceeds phrase_timeout.
            while True:
                elapsed_time += seconds_per_buffer
                if phrase_timeout and elapsed_time - phrase_start_time > phrase_timeout:
                    break
                buffer = source.stream.read(source.CHUNK)
                if len(buffer) == 0:
                    break
                frames.append(buffer)
                energy = audioop.rms(buffer, source.SAMPLE_WIDTH)
                if energy > self.energy_threshold:
                    pause_cnt = 0
                    phrase_cnt += 1
                else:
                    pause_cnt += 1
                if pause_cnt > pause_buffer_cnt:
                    break
            # Accept the phrase if it was long enough; also stop on stream end.
            if phrase_cnt >= phrase_buffer_cnt or len(buffer) == 0:
                break
        if is_listening_cb:
            is_listening_cb(False)
        if frames:
            # Trim trailing silence beyond the configured non-speaking padding.
            for i in range(pause_cnt - non_speaking_buffer_cnt):
                frames.pop()
        frame_data = b"".join(frames)
        return (frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)

    def listen_in_background(self, source, listen_cb, listen_timeout=None, is_listening_cb=None):
        """Run listen() in a daemon thread; returns a stopper(wait_join_stop=True) callable.

        listen_cb(listener, data) is invoked for each captured phrase.
        """
        assert isinstance(source, AudioSource), "Source must be an AudioSource"
        running = [True]  # single-cell flag shared with the worker thread

        def listen_in_thread():
            with source:
                while running[0]:
                    try:
                        # Short listen_timeout so the running flag is polled
                        # at least once per second.
                        data = self.listen(source=source,
                                           listen_timeout=1,
                                           phrase_timeout=listen_timeout,
                                           is_listening_cb=is_listening_cb)
                    except WaitTimeoutError:
                        # BUG FIX: the original called is_listening_cb(False)
                        # unconditionally, crashing when the callback is None.
                        if is_listening_cb:
                            is_listening_cb(False)
                    else:
                        if running[0]:
                            listen_cb(self, data)

        def stopper(wait_join_stop=True):
            running[0] = False
            if wait_join_stop:
                listener_thread.join()

        # BUG FIX: the original set `listener_thread.deamon = True` (typo),
        # which only created an unused attribute — the thread was NOT a daemon
        # and kept the interpreter alive on exit.
        listener_thread = threading.Thread(target=listen_in_thread, daemon=True)
        listener_thread.start()
        return stopper