haha
parent 2095207a1d
commit 6754406c0c
.gitmodules (vendored, new file)
@@ -0,0 +1,3 @@
[submodule "opaudio"]
	path = opaudio
	url = https://grrrit.le-club-des-sans-sujets.org/Le-Club-des-Sans-Sujets/opaudio.git
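(.gitmodules registers opaudio as a git submodule, so a fresh checkout needs the standard submodule steps; these are ordinary git commands, not part of this commit:

    git clone --recurse-submodules <repo-url>
    # or, in an existing checkout:
    git submodule update --init
)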

audio.py (new file)
@@ -0,0 +1,289 @@
import math
import pyaudio as pa
from opaudio import audioop
import threading
import collections


class AudioSource(object):
    def __init__(self):
        raise NotImplementedError("this is an abstract class")

    def __enter__(self):
        raise NotImplementedError("this is an abstract class")

    def __exit__(self, exc_type, exc_value, traceback):
        raise NotImplementedError("this is an abstract class")


class Microphone(AudioSource):

    def __init__(self, device_info=None, sample_rate=None, chunk_size=1024):
        assert device_info is not None, "device_info must not be None (see Microphone.select)"
        a = pa.PyAudio()
        try:
            # fall back to the device's default sample rate if none was given
            if sample_rate is None:
                assert isinstance(device_info.get('defaultSampleRate'), (float, int)) and device_info.get('defaultSampleRate') > 0, "Wrong sample rate provided by PyAudio"
                sample_rate = int(device_info.get('defaultSampleRate'))
            self.device_info = device_info
        finally:
            a.terminate()

        self.format = pa.paInt16
        self.SAMPLE_WIDTH = pa.get_sample_size(self.format)
        self.SAMPLE_RATE = sample_rate
        self.CHUNK = chunk_size

        self.audio = None
        self.stream = None

    @staticmethod
    def select():
        n = 0
        microphones = []
        a = pa.PyAudio()
        try:
            for i in range(a.get_device_count()):
                d = a.get_device_info_by_index(i)
                if d.get('maxInputChannels') > 0:
                    microphones.append(d)
                    print(f"{n}. {d.get('name')}")
                    n += 1

            while True:
                sel = input("select microphone: ")
                if not sel.isdigit() or int(sel) >= n:  # valid indices are 0..n-1
                    print("Wrong selection.")
                    continue
                return microphones[int(sel)]
        finally:
            a.terminate()

    def __enter__(self):
        assert self.stream is None, "Source already streaming"
        self.a = pa.PyAudio()
        try:
            pa_stream = self.a.open(input_device_index=self.device_info.get('index'), channels=1, format=self.format, rate=self.SAMPLE_RATE, frames_per_buffer=self.CHUNK, input=True)
            self.stream = Microphone.Stream(pa_stream)
        except Exception:
            # opening the stream failed: release PyAudio and propagate the error
            self.a.terminate()
            raise
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        try:
            if self.stream is not None:
                self.stream.close()
        finally:
            self.stream = None
            self.a.terminate()

    class Stream(object):

        def __init__(self, pa_stream):
            self.pa_stream = pa_stream

        def read(self, size):
            return self.pa_stream.read(size, exception_on_overflow=False)

        def close(self):
            try:
                if not self.pa_stream.is_stopped():
                    self.pa_stream.stop_stream()
            finally:
                self.pa_stream.close()


class WaitTimeoutError(Exception):
    pass


class Listener(AudioSource):

    def __init__(self):
        self.energy_threshold = 300  # minimum audio energy to consider for recording
        self.dynamic_energy_threshold = True
        self.dynamic_energy_adjustment_damping = 0.15
        self.dynamic_energy_ratio = 1.5
        self.pause_threshold = 0.8  # seconds of non-speaking audio before a phrase is considered complete
        self.operation_timeout = None  # seconds after an internal operation (e.g., an API request) starts before it times out, or ``None`` for no timeout
        self.phrase_threshold = 0.5  # minimum seconds of speaking audio before we consider the speaking audio a phrase - values below this are ignored (for filtering out clicks and pops)
        self.non_speaking_duration = 0.5  # seconds of non-speaking audio to keep on both sides of the recording

    def check_source(self, source):
        assert isinstance(source, AudioSource), "Source must be an AudioSource"
        assert source.stream is not None, "Source must be streaming"

    def dynamic_thresholding(self, energy, buffer, seconds_per_buffer):
        # exponentially smooth the threshold towards energy * ratio;
        # longer buffers shrink the damping factor, so they adapt faster
        damping = self.dynamic_energy_adjustment_damping ** seconds_per_buffer
        target_energy = energy * self.dynamic_energy_ratio
        self.energy_threshold = self.energy_threshold * damping + target_energy * (1 - damping)

    def adjust_ambient_noise(self, source, duration=1):
        self.check_source(source)

        seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
        elapsed_time = 0

        print(f"Adjusting for ambient noise ({duration}s)")

        # sample the room for `duration` seconds and let the threshold settle
        while True:
            elapsed_time += seconds_per_buffer
            if elapsed_time > duration:
                break
            buffer = source.stream.read(source.CHUNK)
            energy = audioop.rms(buffer, source.SAMPLE_WIDTH)

            self.dynamic_thresholding(energy, buffer, seconds_per_buffer)

    def listen(self, source, listen_timeout=None, phrase_timeout=None, is_listening_cb=None):
        self.check_source(source)

        seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
        elapsed_time = 0

        pause_buffer_cnt = int(math.ceil(self.pause_threshold / seconds_per_buffer))
        phrase_buffer_cnt = int(math.ceil(self.phrase_threshold / seconds_per_buffer))
        non_speaking_buffer_cnt = int(math.ceil(self.non_speaking_duration / seconds_per_buffer))

        buffer = b""

        while True:
            frames = collections.deque()

            # wait for speech, keeping a rolling window of the leading audio
            while True:
                elapsed_time += seconds_per_buffer
                if listen_timeout and elapsed_time > listen_timeout:
                    raise WaitTimeoutError("Listener timed out while waiting for input")

                buffer = source.stream.read(source.CHUNK)
                if len(buffer) == 0:
                    break

                frames.append(buffer)

                if len(frames) > non_speaking_buffer_cnt:
                    frames.popleft()

                energy = audioop.rms(buffer, source.SAMPLE_WIDTH)
                if energy > self.energy_threshold:
                    break

                if self.dynamic_energy_threshold:
                    self.dynamic_thresholding(energy, buffer, seconds_per_buffer)

            phrase_start_time = elapsed_time
            # reset per attempt so isolated clicks and pops do not accumulate into a phrase
            pause_cnt = 0
            phrase_cnt = 0

            if is_listening_cb:
                is_listening_cb(True)

            # record until the pause gets long enough, the phrase times out, or the stream ends
            while True:
                elapsed_time += seconds_per_buffer

                if phrase_timeout and elapsed_time - phrase_start_time > phrase_timeout:
                    break

                buffer = source.stream.read(source.CHUNK)
                if len(buffer) == 0:
                    break

                frames.append(buffer)

                energy = audioop.rms(buffer, source.SAMPLE_WIDTH)
                if energy > self.energy_threshold:
                    pause_cnt = 0
                    phrase_cnt += 1
                else:
                    pause_cnt += 1

                if pause_cnt > pause_buffer_cnt:
                    break

            # accept the phrase only if it was long enough (filters clicks and pops)
            if phrase_cnt >= phrase_buffer_cnt or len(buffer) == 0:
                break

        if is_listening_cb:
            is_listening_cb(False)

        if frames:
            # drop trailing silence, keeping non_speaking_duration seconds at the end
            for i in range(pause_cnt - non_speaking_buffer_cnt):
                frames.pop()

        frame_data = b"".join(frames)

        return (frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)

    def listen_in_background(self, source, listen_cb, listen_timeout=None, is_listening_cb=None):
        assert isinstance(source, AudioSource), "Source must be an AudioSource"
        running = [True]

        def listen_in_thread():
            with source:
                while running[0]:
                    try:
                        data = self.listen(source=source, listen_timeout=1, phrase_timeout=listen_timeout, is_listening_cb=is_listening_cb)
                    except WaitTimeoutError:
                        if is_listening_cb:
                            is_listening_cb(False)
                    else:
                        if running[0]:
                            listen_cb(self, data)

        def stopper(wait_join_stop=True):
            running[0] = False
            if wait_join_stop:
                listener_thread.join()

        listener_thread = threading.Thread(target=listen_in_thread)
        listener_thread.daemon = True
        listener_thread.start()
        return stopper

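(As a quick orientation, this is roughly how the classes above combine for a single foreground capture; a sketch, assuming audio.py is on the import path, with an illustrative timeout value:

    import audio

    device = audio.Microphone.select()          # interactive device picker
    mic = audio.Microphone(device_info=device)

    listener = audio.Listener()
    with mic:                                   # opens the PyAudio input stream
        # raises audio.WaitTimeoutError if no speech starts within 5 seconds
        data, rate, width = listener.listen(mic, listen_timeout=5)
    print(f"captured {len(data)} bytes at {rate} Hz, {width} bytes/sample")

transcribe.py below uses the non-blocking variant, listen_in_background, instead.)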

opaudio (submodule)
@@ -0,0 +1 @@
Subproject commit 2e32c5e6bba76e75ed947ebf5bdf2467f9683a4b

requirements.txt (new file)
@@ -0,0 +1,3 @@
numpy
openai-whisper
PyAudio
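(A note on the dependencies: PyAudio binds the native PortAudio library, which usually has to be installed system-wide before pip can build the wheel; the package name varies by distribution. torch, which transcribe.py imports directly, is not listed here and arrives as a transitive dependency of openai-whisper.)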

transcribe.py (new file)
@@ -0,0 +1,142 @@
import argparse
import numpy
import whisper
import torch
import os

from datetime import datetime, timedelta
from time import sleep
from queue import Queue
from enum import Enum

import audio


class State(Enum):
    IDLE = 1
    TRANSCRIBING = 2
    LISTENING = 3

state = State.IDLE

def main():
    p = argparse.ArgumentParser(description="TRANSCRIPTUM")
    p.add_argument("--model", default="medium", help="Whisper model", choices=["tiny", "base", "small", "medium", "large"])
    p.add_argument("--rms", default=1000, help="RMS (energy) threshold for the microphone to detect speech", type=int)
    p.add_argument("--record_timeout", default=8, help="Timeout for the microphone recording", type=float)
    p.add_argument("--phrase_timeout", default=2, help="Silence timeout between phrases", type=float)
    p.add_argument("--dynamic_threshold", action="store_true", help="Use a dynamic RMS threshold")

    args = p.parse_args()

    record_timeout = args.record_timeout
    phrase_timeout = args.phrase_timeout
    phrase_time = None

    data_queue = Queue()
    transcripts = ['']

    whisper_model = whisper.load_model(args.model)
    print("Model loaded.\n")

    source = audio.Microphone.select()
    microphone = audio.Microphone(device_info=source, sample_rate=22050)

    listener = audio.Listener()
    listener.energy_threshold = args.rms
    listener.dynamic_energy_threshold = args.dynamic_threshold

    # optional: calibrate the threshold against ambient noise before listening
    # with microphone:
    #     listener.adjust_ambient_noise(microphone, duration=1)

    def print_transcripts(bcolor=None):
        # redraw the whole transcript; bcolor is an optional ANSI escape prefix
        os.system("clear")
        for l in transcripts:
            if bcolor:
                print(bcolor + l + '\033[0m')
            else:
                print(l)
        print('', end='', flush=True)

    # buffer is the (frame_data, SAMPLE_RATE, SAMPLE_WIDTH) tuple from Listener.listen
    def listen_callback(_, buffer: tuple) -> None:
        data_queue.put(buffer[0])

    def is_listening_callback(is_listening):
        global state
        if is_listening and state != State.LISTENING:
            print_transcripts('\033[1m')  # bold
            state = State.LISTENING
        elif state != State.IDLE and state != State.TRANSCRIBING:
            print_transcripts()
            state = State.IDLE

    stop = listener.listen_in_background(source=microphone, listen_cb=listen_callback, listen_timeout=record_timeout, is_listening_cb=is_listening_callback)

    os.system("clear")

    while True:
        try:
            now = datetime.utcnow()

            if not data_queue.empty():
                phrase_complete = False
                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
                    phrase_complete = True

                phrase_time = now
                audio_data = b"".join(data_queue.queue)
                data_queue.queue.clear()

                # convert 16-bit PCM to the float32 range [-1, 1) expected by Whisper
                np_data = numpy.frombuffer(audio_data, dtype=numpy.int16).astype(numpy.float32) / 32768.0

                # TRANSCRIBING keeps is_listening_callback from redrawing mid-transcription
                state = State.TRANSCRIBING
                print_transcripts('\033[93m')  # yellow highlight while transcribing
                r = whisper_model.transcribe(np_data, fp16=torch.cuda.is_available())

                t = r['text'].strip()

                if len(t) > 0:
                    if phrase_complete:
                        transcripts.append(t)
                    else:
                        transcripts[-1] = t

                    print_transcripts()

                state = State.IDLE

            sleep(0.25)

        except KeyboardInterrupt:
            break

    stop(True)

    print("\nTranscripts:\n")
    for l in transcripts:
        print(l)


if __name__ == "__main__":
    main()
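(For reference, a direct invocation with the flags defined above; the values shown are illustrative, not defaults:

    python transcribe.py --model small --rms 1200 --phrase_timeout 2 --dynamic_threshold
)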

transcriptum.sh (new executable file)
@@ -0,0 +1,31 @@
#!/bin/bash

if ! [[ "$1" =~ ^(install|clean|run)$ ]]; then
    echo "usage: $0 [action]"
    echo "where action can be: [install, clean, run]"
    exit 1
fi

case $1 in
    install)
        echo "installing virtual environment"
        python -m venv venv
        source venv/bin/activate
        pip install --upgrade pip
        pip install -r requirements.txt
        ;;
    clean)
        echo "cleaning up"
        rm -rf venv
        rm -rf __pycache__
        ;;
    run)
        echo "running"
        source venv/bin/activate
        exec python transcribe.py "${@:2}"
        ;;
esac
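(Typical lifecycle with the helper script; everything after run is forwarded to transcribe.py via "${@:2}":

    ./transcriptum.sh install
    ./transcriptum.sh run --model small
    ./transcriptum.sh clean
)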