gauthiier 2024-04-24 09:42:52 +02:00
parent 2095207a1d
commit 6754406c0c
6 changed files with 469 additions and 0 deletions

.gitmodules vendored Normal file (3 lines added)

@@ -0,0 +1,3 @@
[submodule "opaudio"]
	path = opaudio
	url = https://grrrit.le-club-des-sans-sujets.org/Le-Club-des-Sans-Sujets/opaudio.git
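
Since opaudio comes in as a git submodule, a plain clone leaves its directory empty. The standard git commands apply (the parent repository URL below is a placeholder, not taken from this commit):

    git clone --recurse-submodules <repository-url>
    # or, inside an existing working copy:
    git submodule update --init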

audio.py Normal file (289 lines added)

@@ -0,0 +1,289 @@
import math
import pyaudio as pa
from opaudio import audioop
import threading
import collections

class AudioSource(object):
    def __init__(self):
        raise NotImplementedError("this is an abstract class")

    def __enter__(self):
        raise NotImplementedError("this is an abstract class")

    def __exit__(self, exc_type, exc_value, traceback):
        raise NotImplementedError("this is an abstract class")

class Microphone(AudioSource):
    def __init__(self, device_info=None, sample_rate=None, chunk_size=1024):
        assert device_info is not None, "device_info must not be None (see Microphone.select)"
        a = pa.PyAudio()
        try:
            if sample_rate is None:
                assert isinstance(device_info.get('defaultSampleRate'), (float, int)) and device_info.get('defaultSampleRate') > 0, "Wrong sample rate provided by PyAudio"
                sample_rate = int(device_info.get('defaultSampleRate'))
            self.device_info = device_info
        finally:
            a.terminate()
        self.format = pa.paInt16
        self.SAMPLE_WIDTH = pa.get_sample_size(self.format)
        self.SAMPLE_RATE = sample_rate
        self.CHUNK = chunk_size
        self.audio = None
        self.stream = None
    @staticmethod
    def select():
        # list input-capable devices and prompt for a selection
        n = 0
        microphones = []
        a = pa.PyAudio()
        for i in range(a.get_device_count()):
            d = a.get_device_info_by_index(i)
            if d.get('maxInputChannels') > 0:
                microphones.append(d)
                print(f"{n}. {d.get('name')}")
                n += 1
        while True:
            sel = input("select microphone: ")
            if not sel.isdigit() or int(sel) >= n:
                print("Wrong selection.")
                continue
            m = microphones[int(sel)]
            a.terminate()
            return m
    def __enter__(self):
        assert self.stream is None, "Source already streaming"
        self.a = pa.PyAudio()
        try:
            pa_stream = self.a.open(input_device_index=self.device_info.get('index'), channels=1, format=self.format, rate=self.SAMPLE_RATE, frames_per_buffer=self.CHUNK, input=True)
            self.stream = Microphone.Stream(pa_stream)
        except Exception as e:
            print(e)
            self.a.terminate()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        try:
            self.stream.close()
        finally:
            self.stream = None
            self.a.terminate()

    class Stream(object):
        def __init__(self, pa_stream):
            self.pa_stream = pa_stream

        def read(self, size):
            return self.pa_stream.read(size, exception_on_overflow=False)

        def close(self):
            try:
                if not self.pa_stream.is_stopped():
                    self.pa_stream.stop_stream()
            finally:
                self.pa_stream.close()


class WaitTimeoutError(Exception):
    pass

class Listener(AudioSource):
    def __init__(self):
        self.energy_threshold = 300  # minimum audio energy to consider for recording
        self.dynamic_energy_threshold = True
        self.dynamic_energy_adjustment_damping = 0.15
        self.dynamic_energy_ratio = 1.5
        self.pause_threshold = 0.8  # seconds of non-speaking audio before a phrase is considered complete
        self.operation_timeout = None  # seconds after an internal operation (e.g., an API request) starts before it times out, or None for no timeout
        self.phrase_threshold = 0.5  # minimum seconds of speaking audio before we consider the speaking audio a phrase - values below this are ignored (for filtering out clicks and pops)
        self.non_speaking_duration = 0.5  # seconds of non-speaking audio to keep on both sides of the recording

    def check_source(self, source):
        assert isinstance(source, AudioSource), "Source must be an AudioSource"
        assert source.stream is not None, "Source must be streaming"

    def dynamic_thresholding(self, energy, buffer, seconds_per_buffer):
        # exponentially drift the threshold toward energy * dynamic_energy_ratio;
        # the damping factor accounts for how much time each buffer represents
        damping = self.dynamic_energy_adjustment_damping ** seconds_per_buffer
        target_energy = energy * self.dynamic_energy_ratio
        self.energy_threshold = self.energy_threshold * damping + target_energy * (1 - damping)

    def adjust_ambient_noise(self, source, duration=1):
        self.check_source(source)
        seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
        elapsed_time = 0
        print(f"Adjust ambient noise {duration}")
        while True:
            elapsed_time += seconds_per_buffer
            if elapsed_time > duration:
                break
            buffer = source.stream.read(source.CHUNK)
            energy = audioop.rms(buffer, source.SAMPLE_WIDTH)
            self.dynamic_thresholding(energy, buffer, seconds_per_buffer)
    def listen(self, source, listen_timeout=None, phrase_timeout=None, is_listening_cb=None):
        self.check_source(source)
        seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
        elapsed_time = 0
        pause_buffer_cnt = int(math.ceil(self.pause_threshold / seconds_per_buffer))
        phrase_buffer_cnt = int(math.ceil(self.phrase_threshold / seconds_per_buffer))
        non_speaking_buffer_cnt = int(math.ceil(self.non_speaking_duration / seconds_per_buffer))
        buffer = b""
        pause_cnt = 0
        phrase_cnt = 0
        while True:
            frames = collections.deque()
            # wait for a phrase to start: keep a rolling window of the last
            # non_speaking_buffer_cnt chunks until the energy crosses the threshold
            while True:
                elapsed_time += seconds_per_buffer
                if listen_timeout and elapsed_time > listen_timeout:
                    raise WaitTimeoutError("Listener timed out while waiting for input")
                buffer = source.stream.read(source.CHUNK)
                if len(buffer) == 0:
                    break
                frames.append(buffer)
                if len(frames) > non_speaking_buffer_cnt:
                    frames.popleft()
                energy = audioop.rms(buffer, source.SAMPLE_WIDTH)
                if energy > self.energy_threshold:
                    break
                if self.dynamic_energy_threshold:
                    self.dynamic_thresholding(energy, buffer, seconds_per_buffer)
            phrase_start_time = elapsed_time
            if is_listening_cb:
                is_listening_cb(True)
            # record the phrase: stop on a pause longer than pause_threshold,
            # on phrase_timeout, or when the stream runs dry
            while True:
                elapsed_time += seconds_per_buffer
                if phrase_timeout and elapsed_time - phrase_start_time > phrase_timeout:
                    break
                buffer = source.stream.read(source.CHUNK)
                if len(buffer) == 0:
                    break
                frames.append(buffer)
                energy = audioop.rms(buffer, source.SAMPLE_WIDTH)
                if energy > self.energy_threshold:
                    pause_cnt = 0
                    phrase_cnt += 1
                else:
                    pause_cnt += 1
                if pause_cnt > pause_buffer_cnt:
                    break
            # accept the phrase only if it contained enough speaking audio; otherwise start over
            if phrase_cnt >= phrase_buffer_cnt or len(buffer) == 0:
                break
        if is_listening_cb:
            is_listening_cb(False)
        if frames:
            # trim trailing silence down to non_speaking_duration seconds
            for i in range(pause_cnt - non_speaking_buffer_cnt):
                frames.pop()
        frame_data = b"".join(frames)
        return (frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
    def listen_in_background(self, source, listen_cb, listen_timeout=None, is_listening_cb=None):
        assert isinstance(source, AudioSource), "Source must be an AudioSource"
        running = [True]

        def listen_in_thread():
            with source:
                while running[0]:
                    try:
                        data = self.listen(source=source, listen_timeout=1, phrase_timeout=listen_timeout, is_listening_cb=is_listening_cb)
                    except WaitTimeoutError:
                        if is_listening_cb:
                            is_listening_cb(False)
                    else:
                        if running[0]:
                            listen_cb(self, data)

        def stopper(wait_join_stop=True):
            running[0] = False
            if wait_join_stop:
                listener_thread.join()

        listener_thread = threading.Thread(target=listen_in_thread)
        listener_thread.daemon = True
        listener_thread.start()
        return stopper
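
For orientation, here is a minimal sketch of how these pieces fit together, calling Listener.listen directly rather than through the background thread. This is illustrative only: device selection is interactive, and listen raises WaitTimeoutError if nothing crosses the energy threshold within listen_timeout.

    import audio

    info = audio.Microphone.select()                      # interactive device picker
    mic = audio.Microphone(device_info=info, sample_rate=22050)
    listener = audio.Listener()

    with mic:                                             # opens the PyAudio input stream
        frames, rate, width = listener.listen(mic, listen_timeout=10)
    print(f"captured {len(frames)} bytes ({rate} Hz, {width} bytes/sample)")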

opaudio Submodule (1 line added)

@@ -0,0 +1 @@
Subproject commit 2e32c5e6bba76e75ed947ebf5bdf2467f9683a4b
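
Worth noting: audio.py imports audioop from this submodule (from opaudio import audioop) rather than from the standard library; audioop was deprecated in CPython 3.11 and removed in 3.13, so a vendored copy presumably keeps audioop.rms() available on newer interpreters. A minimal smoke test, assuming the submodule is checked out:

    from opaudio import audioop

    silence = b"\x00\x00" * 1024          # 1024 silent 16-bit samples
    assert audioop.rms(silence, 2) == 0   # RMS energy of pure silence is zero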

requirements.txt Normal file (3 lines added)

@@ -0,0 +1,3 @@
numpy
openai-whisper
PyAudio
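
One caveat worth flagging: PyAudio binds to the PortAudio C library, so installing it with pip usually requires the PortAudio development headers on the system (on Debian/Ubuntu that package is portaudio19-dev; other platforms differ). torch, which transcribe.py imports directly, is pulled in transitively by openai-whisper.

    sudo apt install portaudio19-dev   # assumption: a Debian/Ubuntu host
    pip install -r requirements.txt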

transcribe.py Normal file (142 lines added)

@@ -0,0 +1,142 @@
import argparse
import numpy
import whisper
import torch
import os
from datetime import datetime, timedelta
from time import sleep
from queue import Queue
from enum import Enum

import audio


class State(Enum):
    IDLE = 1
    TRANSCRIBING = 2
    LISTENING = 3


state = State.IDLE

def main():
    p = argparse.ArgumentParser(description="TRANSCRIPTUM")
    p.add_argument("--model", default="medium", help="Whisper model", choices=["tiny", "base", "small", "medium", "large"])
    p.add_argument("--rms", default=1000, help="RMS (energy) threshold above which the microphone detects speech", type=int)
    p.add_argument("--record_timeout", default=8, help="Timeout for the microphone recording", type=float)
    p.add_argument("--phrase_timeout", default=2, help="Silence timeout between phrases", type=float)
    p.add_argument("--dynamic_threshold", action="store_true", help="Use a dynamic RMS threshold")
    args = p.parse_args()

    record_timeout = args.record_timeout
    phrase_timeout = args.phrase_timeout
    phrase_time = None
    data_queue = Queue()
    transcripts = ['']

    whisper_model = whisper.load_model(args.model)
    print("Model loaded.\n")

    source = audio.Microphone.select()
    microphone = audio.Microphone(device_info=source, sample_rate=22050)
    listener = audio.Listener()
    listener.energy_threshold = args.rms
    listener.dynamic_energy_threshold = args.dynamic_threshold

    # optionally calibrate the energy threshold against ambient noise before listening:
    # with microphone:
    #     listener.adjust_ambient_noise(microphone, duration=1)
    def print_transcripts(bcolor=None):
        os.system("clear")
        for l in transcripts:
            if bcolor:
                print(bcolor + l + '\033[0m')
            else:
                print(l)
        print('', end='', flush=True)

    # listen_cb receives (frame_data, SAMPLE_RATE, SAMPLE_WIDTH); only the raw frames are queued
    def listen_callback(_, buffer: tuple) -> None:
        data_queue.put(buffer[0])

    def is_listening_callback(is_listening):
        global state
        if is_listening and state != State.LISTENING:
            print_transcripts('\033[1m')  # bold: a phrase is being recorded
            state = State.LISTENING
        elif state != State.IDLE and state != State.TRANSCRIBING:
            print_transcripts()
            state = State.IDLE
    stop = listener.listen_in_background(source=microphone, listen_cb=listen_callback, listen_timeout=record_timeout, is_listening_cb=is_listening_callback)
    os.system("clear")

    while True:
        try:
            now = datetime.utcnow()
            if not data_queue.empty():
                phrase_complete = False
                # if enough silence has passed since the last chunk, start a new transcript line
                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
                    phrase_complete = True
                phrase_time = now
                audio_data = b"".join(data_queue.queue)
                data_queue.queue.clear()
                # 16-bit PCM bytes -> float32 samples in [-1.0, 1.0], the format Whisper expects
                np_data = numpy.frombuffer(audio_data, dtype=numpy.int16).astype(numpy.float32) / 32768.0
                state = State.TRANSCRIBING
                print_transcripts('\033[93m')  # yellow: transcription in progress
                r = whisper_model.transcribe(np_data, fp16=torch.cuda.is_available())
                t = r['text'].strip()
                if len(t) > 0:
                    if phrase_complete:
                        transcripts.append(t)
                    else:
                        transcripts[-1] = t
                print_transcripts()
                state = State.IDLE
            sleep(0.25)
        except KeyboardInterrupt:
            break

    stop(True)
    print("\nTranscripts:\n")
    for l in transcripts:
        print(l)


if __name__ == "__main__":
    main()
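
For reference, a sample invocation using the flags defined above (the values are arbitrary examples, not tuned recommendations):

    python transcribe.py --model small --rms 800 --phrase_timeout 3 --dynamic_threshold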

transcriptum.sh Executable file (31 lines added)

@@ -0,0 +1,31 @@
#!/bin/bash

if ! [[ "$1" =~ ^(install|clean|run)$ ]]; then
    echo "usage: $0 [action]"
    echo "where action can be: [install, clean, run]"
    exit 1
fi

case $1 in
    install)
        echo "installing virtual environment"
        python -m venv venv
        source venv/bin/activate
        pip install --upgrade pip
        pip install -r requirements.txt
        ;;
    clean)
        echo "cleaning up"
        rm -rf venv
        rm -rf __pycache__
        ;;
    run)
        echo "running"
        source venv/bin/activate
        exec python transcribe.py "${@:2}"
        ;;
esac
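
A typical session, given the three actions the script defines; anything after run is forwarded to transcribe.py via "${@:2}":

    ./transcriptum.sh install
    ./transcriptum.sh run --model small --dynamic_threshold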