import json

import ffmpeg
import whisper
from pyannote.audio import Pipeline
from pyannote.core import Segment


def transcribe_video_to_json(video_path, diarization_pipeline):
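    """Transcribe a video with Whisper and label each segment with its speaker.

    The audio track is extracted with ffmpeg, the given pyannote pipeline
    assigns speaker turns, Whisper transcribes the speech, and the
    speaker-labelled segments are written to transcription.json.
    """
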
    # Load the Whisper model
    model = whisper.load_model("base")

    # Extract the audio track from the video
    audio_path = "audio.wav"
    ffmpeg.input(video_path).output(audio_path).run(overwrite_output=True)
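    # (Optional) a 16 kHz mono WAV could be requested here, e.g.
    # .output(audio_path, ar=16000, ac=1); both Whisper and pyannote
    # resample internally, so this is not strictly required.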

    # Perform speaker diarization on the extracted audio
    diarization = diarization_pipeline(audio_path)

    # Transcribe the speech; Whisper decodes the file with ffmpeg internally,
    # so the original video can be passed directly
    print("Transcribing...")
    result = model.transcribe(video_path, language="en", verbose=True)
    print("Transcription done")

    # Prepare the JSON data
    json_data = []
    for segment in result['segments']:
        # Find the speaker for the current segment
        start_time = segment['start']
        end_time = segment['end']
        current_segment = Segment(start_time, end_time)

        # Get the speaker for this time interval (first diarization turn that overlaps)
        speaker = None
        for turn, _, speaker_label in diarization.itertracks(yield_label=True):
            if turn.intersects(current_segment):
                speaker = speaker_label
                break

        if speaker is None:
            speaker = "unknown"  # Handle cases where no speaker is found

        json_data.append({
            "speaker": speaker,
            "start_time": start_time,
            "end_time": end_time,
            "text": segment['text']
        })

    # Save the transcription to a JSON file
    with open('transcription.json', 'w') as json_file:
        json.dump(json_data, json_file, indent=4)

    print("Transcription saved to transcription.json")

# Load the diarization pipeline (requires a Hugging Face access token with
# access granted to the gated pyannote/speaker-diarization-3.1 model)
diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token="YOUR_HF_TOKEN",  # never hard-code a real token in shared code
)

# Run the transcription with diarization
transcribe_video_to_json('input.mp4', diarization_pipeline)
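
# Illustrative shape of the resulting transcription.json (values are hypothetical):
# [
#     {
#         "speaker": "SPEAKER_00",
#         "start_time": 0.0,
#         "end_time": 4.2,
#         "text": " Hello and welcome to the show."
#     },
#     ...
# ]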