import json

import ffmpeg
import whisper
from pyannote.audio import Pipeline
from pyannote.core import Segment


def transcribe_video_to_json(video_path, diarization_pipeline):
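    """Transcribe a video with Whisper and label each segment with its speaker.

    The audio track is extracted with ffmpeg, the given pyannote pipeline
    assigns speaker turns, Whisper transcribes the speech, and the
    speaker-labelled segments are written to transcription.json.
    """
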
    # Load the Whisper model
    model = whisper.load_model("base")

    # Extract the audio track from the video
    audio_path = "audio.wav"
    ffmpeg.input(video_path).output(audio_path).run(overwrite_output=True)
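    # (Optional) a 16 kHz mono WAV could be requested here, e.g.
    # .output(audio_path, ar=16000, ac=1); both Whisper and pyannote
    # resample internally, so this is not strictly required.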

    # Perform speaker diarization on the extracted audio
    diarization = diarization_pipeline(audio_path)

    # Transcribe the speech; Whisper decodes the file with ffmpeg internally,
    # so the original video can be passed directly
    print("Transcribing...")
    result = model.transcribe(video_path, language="en", verbose=True)
    print("Transcription done")

    # Prepare the JSON data
    json_data = []
    for segment in result['segments']:
        # Find the speaker for the current segment
        start_time = segment['start']
        end_time = segment['end']
        current_segment = Segment(start_time, end_time)

        # Get the speaker for this time interval (first diarization turn that overlaps)
        speaker = None
        for turn, _, speaker_label in diarization.itertracks(yield_label=True):
            if turn.intersects(current_segment):
                speaker = speaker_label
                break

        if speaker is None:
            speaker = "unknown"  # Handle cases where no speaker is found

        json_data.append({
            "speaker": speaker,
            "start_time": start_time,
            "end_time": end_time,
            "text": segment['text']
        })

    # Save the transcription to a JSON file
    with open('transcription.json', 'w') as json_file:
        json.dump(json_data, json_file, indent=4)

    print("Transcription saved to transcription.json")

# Load the diarization pipeline (requires a Hugging Face access token with
# access granted to the gated pyannote/speaker-diarization-3.1 model)
diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token="YOUR_HF_TOKEN",  # never hard-code a real token in shared code
)

# Run the transcription with diarization
transcribe_video_to_json('input.mp4', diarization_pipeline)
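
# Illustrative shape of the resulting transcription.json (values are hypothetical):
# [
#     {
#         "speaker": "SPEAKER_00",
#         "start_time": 0.0,
#         "end_time": 4.2,
#         "text": " Hello and welcome to the show."
#     },
#     ...
# ]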