import json
import os

import ffmpeg
import whisper
from pyannote.audio import Pipeline
from pyannote.core import Segment


def transcribe_video_to_json(video_path, diarization_pipeline):
    # Load the Whisper model
    model = whisper.load_model("base")

    # Extract the audio track from the video so diarization and transcription
    # both run on the same file
    audio_path = "audio.wav"
    ffmpeg.input(video_path).output(audio_path).run(overwrite_output=True)

    # Perform speaker diarization on the extracted audio
    diarization = diarization_pipeline(audio_path)

    # Transcribe the extracted audio (same timeline as the diarization)
    print("Transcribing...")
    result = model.transcribe(audio_path, language="en", verbose=True)
    print("Transcription done")

    # Merge transcription segments with speaker labels
    json_data = []
    for segment in result['segments']:
        start_time = segment['start']
        end_time = segment['end']
        current_segment = Segment(start_time, end_time)

        # Assign the first diarization turn that overlaps this segment
        speaker = None
        for turn, _, speaker_label in diarization.itertracks(yield_label=True):
            if turn.intersects(current_segment):
                speaker = speaker_label
                break

        if speaker is None:
            speaker = "unknown"  # no overlapping turn was found

        json_data.append({
            "speaker": speaker,
            "start_time": start_time,
            "end_time": end_time,
            "text": segment['text']
        })

    # Save the speaker-labelled transcript to a JSON file
    with open('transcription.json', 'w') as json_file:
        json.dump(json_data, json_file, indent=4)

    print("Transcription saved to transcription.json")


if __name__ == "__main__":
    # Load the diarization pipeline; keep the Hugging Face token out of the
    # source and read it from the environment instead
    diarization_pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=os.environ["HF_TOKEN"],
    )

    # Run the transcription with diarization
    transcribe_video_to_json('input.mp4', diarization_pipeline)