# diarizejson/main.py
import whisper
import ffmpeg
import json
import os
from pyannote.audio import Pipeline
from pyannote.core import Segment
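
# Dependencies (assumed): openai-whisper, ffmpeg-python, pyannote.audio, and
# the ffmpeg binary on PATH. The pyannote/speaker-diarization-3.1 model is
# gated on Hugging Face, so the token used below must come from an account
# that has accepted the model's terms of use.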

def transcribe_video_to_json(video_path, diarization_pipeline):
    # Load the Whisper model
    model = whisper.load_model("base")

    # Extract the audio track from the video
    audio_path = "audio.wav"
    ffmpeg.input(video_path).output(audio_path).run(overwrite_output=True)

    # Perform speaker diarization on the extracted audio
    diarization = diarization_pipeline(audio_path)

    # Transcribe the extracted audio (reusing it avoids decoding the video twice)
    print("Transcribing...")
    result = model.transcribe(audio_path, language="en", verbose=True)
    print("Transcribing done.")

    # Build the JSON records, pairing each transcript segment with a speaker
    json_data = []
    for segment in result['segments']:
        start_time = segment['start']
        end_time = segment['end']
        current_segment = Segment(start_time, end_time)

        # Take the first diarization turn that overlaps this segment
        speaker = None
        for turn, _, speaker_label in diarization.itertracks(yield_label=True):
            if turn.intersects(current_segment):
                speaker = speaker_label
                break
        if speaker is None:
            speaker = "unknown"  # no diarization turn overlaps this segment

        json_data.append({
            "speaker": speaker,
            "start_time": start_time,
            "end_time": end_time,
            "text": segment['text']
        })

    # Save the transcription to a JSON file
    with open('transcription.json', 'w') as json_file:
        json.dump(json_data, json_file, indent=4)
    print("Transcription saved to transcription.json")

if __name__ == "__main__":
    # Load the diarization pipeline; read the Hugging Face token from the
    # environment rather than hard-coding a secret in the source.
    diarization_pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=os.environ.get("HF_TOKEN"),
    )

    # Run the transcription with diarization
    transcribe_video_to_json('input.mp4', diarization_pipeline)
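
# Illustrative output shape for transcription.json (values are made up;
# pyannote speaker labels typically look like "SPEAKER_00"):
# [
#     {
#         "speaker": "SPEAKER_00",
#         "start_time": 0.0,
#         "end_time": 4.2,
#         "text": " Hello and welcome."
#     }
# ]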