Initial commit

2024-08-13 17:18:24 -04:00
commit 6665dec410
5 changed files with 391 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,162 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 *.py,cover
 .hypothesis/
 .pytest_cache/
 cover/
 # Translations
 *.mo
 *.pot
 # Django stuff:
 *.log
 local_settings.py
 db.sqlite3
 db.sqlite3-journal
 # Flask stuff:
 instance/
 .webassets-cache
 # Scrapy stuff:
 .scrapy
 # Sphinx documentation
 docs/_build/
 # PyBuilder
 .pybuilder/
 target/
 # Jupyter Notebook
 .ipynb_checkpoints
 # IPython
 profile_default/
 ipython_config.py
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
 # .python-version
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 #   install all needed dependencies.
 #Pipfile.lock
 # poetry
 #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
 #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
 #poetry.lock
 # pdm
 #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
 #pdm.lock
 #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
 #   in version control.
 #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
 .pdm.toml
 .pdm-python
 .pdm-build/
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
 __pypackages__/
 # Celery stuff
 celerybeat-schedule
 celerybeat.pid
 # SageMath parsed files
 *.sage.py
 # Environments
 .env
 .venv
 env/
 venv/
 ENV/
 env.bak/
 venv.bak/
 # Spyder project settings
 .spyderproject
 .spyproject
 # Rope project settings
 .ropeproject
 # mkdocs documentation
 /site
 # mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json
 # Pyre type checker
 .pyre/
 # pytype static type analyzer
 .pytype/
 # Cython debug symbols
 cython_debug/
 # PyCharm
 #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
--- a/main.py
+++ b/main.py
@@ -0,0 +1,60 @@
 import whisper
 import ffmpeg
 import json
 from pyannote.audio import Pipeline
 from pyannote.core import Segment
 def transcribe_video_to_json(video_path, diarization_pipeline, output_path='transcription.json',
                              audio_path='audio.wav', model_name='base'):
    """Transcribe a video with Whisper and tag every segment with a diarized speaker.
    Args:
        video_path: Path of the video to process.
        diarization_pipeline: Callable that takes an audio file path and returns an
            object exposing ``itertracks(yield_label=True)`` (a pyannote pipeline).
        output_path: Where the resulting JSON list is written.
        audio_path: Where the audio extracted from the video is written (overwritten
            if it already exists).
        model_name: Name passed to ``whisper.load_model``.
    Returns:
        The list of dicts that was written to ``output_path``; each dict has the keys
        "speaker", "start_time", "end_time" and "text".
    """
    # Load Whisper model
    model = whisper.load_model(model_name)
    # Extract audio from video; the diarization pipeline runs on this file
    ffmpeg.input(video_path).output(audio_path).run(overwrite_output=True)
    # Perform speaker diarization
    diarization = diarization_pipeline(audio_path)
    # Read the speaker turns once instead of re-walking the annotation for every segment
    turns = [(turn, label) for turn, _, label in diarization.itertracks(yield_label=True)]
    # Transcribe the video (Whisper is handed the original video path, not the extracted audio)
    print("transcribing")
    result = model.transcribe(video_path, language="en", verbose=True)
    print("transcribing done")
    # Prepare the JSON data
    json_data = []
    for segment in result['segments']:
        start_time = segment['start']
        end_time = segment['end']
        current_segment = Segment(start_time, end_time)
        # Among the turns that intersect this segment, keep the one that covers the
        # largest part of it; the first turn wins ties. Taking simply the first
        # intersecting turn mislabels segments that start just before a speaker change.
        speaker = None
        best_overlap = 0.0
        for turn, speaker_label in turns:
            if not turn.intersects(current_segment):
                continue
            overlap = (turn & current_segment).duration
            if speaker is None or overlap > best_overlap:
                speaker = speaker_label
                best_overlap = overlap
        if speaker is None:
            speaker = "unknown"  # Handle cases where no speaker is found
        json_data.append({
            "speaker": speaker,
            "start_time": start_time,
            "end_time": end_time,
            "text": segment['text']
        })
    # Save the transcription to a JSON file
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(json_data, json_file, indent=4)
    print(f"Transcription saved to {output_path}")
    return json_data
 # Load the diarization pipeline and run the transcription when executed as a script
 if __name__ == "__main__":
    import os
    # The Hugging Face access token is taken from the environment so that the
    # credential never lives in source control.
    # NOTE(review): any token that was previously committed in this file should be revoked.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise SystemExit("Set the HF_TOKEN environment variable to a Hugging Face access token.")
    diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=hf_token)
    # Run the transcription with diarization
    transcribe_video_to_json('input.mp4', diarization_pipeline)
--- a/pyvenv.cfg
+++ b/pyvenv.cfg
@@ -0,0 +1,5 @@
 home = /opt/homebrew/opt/python@3.12/bin
 include-system-site-packages = false
 version = 3.12.3
 executable = /opt/homebrew/Cellar/python@3.12/3.12.3/Frameworks/Python.framework/Versions/3.12/bin/python3.12
 command = /opt/homebrew/opt/python@3.12/bin/python3.12 -m venv /Users/tanishqdubey/projects/diarizejson
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,108 @@
 aiohappyeyeballs==2.3.5
 aiohttp==3.10.3
 aiosignal==1.3.1
 alembic==1.13.2
 antlr4-python3-runtime==4.9.3
 asteroid-filterbanks==0.4.0
 attrs==24.2.0
 audioread==3.0.1
 certifi==2024.7.4
 cffi==1.17.0
 charset-normalizer==3.3.2
 click==8.1.7
 colorlog==6.8.2
 contourpy==1.2.1
 cycler==0.12.1
 decorator==4.4.2
 docopt==0.6.2
 einops==0.8.0
 ffmpeg-python==0.2.0
 filelock==3.15.4
 fonttools==4.53.1
 frozenlist==1.4.1
 fsspec==2024.6.1
 future==1.0.0
 huggingface-hub==0.24.5
 HyperPyYAML==1.2.2
 idna==3.7
 imageio==2.35.0
 imageio-ffmpeg==0.5.1
 Jinja2==3.1.4
 joblib==1.4.2
 julius==0.2.7
 kiwisolver==1.4.5
 lazy_loader==0.4
 librosa==0.10.2.post1
 lightning==2.4.0
 lightning-utilities==0.11.6
 llvmlite==0.43.0
 Mako==1.3.5
 markdown-it-py==3.0.0
 MarkupSafe==2.1.5
 matplotlib==3.9.2
 mdurl==0.1.2
 more-itertools==10.4.0
 moviepy==1.0.3
 mpmath==1.3.0
 msgpack==1.0.8
 multidict==6.0.5
 networkx==3.3
 numba==0.60.0
 numpy==1.26.4
 omegaconf==2.3.0
 openai-whisper==20231117
 optuna==3.6.1
 packaging==24.1
 pandas==2.2.2
 pillow==10.4.0
 platformdirs==4.2.2
 pooch==1.8.2
 primePy==1.3
 proglog==0.1.10
 protobuf==5.27.3
 pyannote.audio==3.3.1
 pyannote.core==5.0.0
 pyannote.database==5.1.0
 pyannote.metrics==3.2.1
 pyannote.pipeline==3.0.1
 pycparser==2.22
 Pygments==2.18.0
 pyparsing==3.1.2
 python-dateutil==2.9.0.post0
 pytorch-lightning==2.4.0
 pytorch-metric-learning==2.6.0
 pytz==2024.1
 PyYAML==6.0.2
 regex==2024.7.24
 requests==2.32.3
 rich==13.7.1
 ruamel.yaml==0.18.6
 ruamel.yaml.clib==0.2.8
 scikit-learn==1.5.1
 scipy==1.14.0
 semver==3.0.2
 sentencepiece==0.2.0
 setuptools==72.1.0
 shellingham==1.5.4
 six==1.16.0
 sortedcontainers==2.4.0
 soundfile==0.12.1
 soxr==0.4.0
 speechbrain==1.0.0
 SQLAlchemy==2.0.32
 sympy==1.13.2
 tabulate==0.9.0
 tensorboardX==2.6.2.2
 threadpoolctl==3.5.0
 tiktoken==0.7.0
 torch==2.4.0
 torch-audiomentations==0.11.1
 torch-pitch-shift==1.2.4
 torchaudio==2.4.0
 torchmetrics==1.4.1
 tqdm==4.66.5
 typer==0.12.3
 typing_extensions==4.12.2
 tzdata==2024.1
 urllib3==2.2.2
 yarl==1.9.4
--- a/subtitle.py
+++ b/subtitle.py
@@ -0,0 +1,56 @@
 import json
 from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
 from moviepy.video.tools.subtitles import SubtitlesClip
 from PIL import ImageFont
 # Function to assign colors to speakers
 def get_speaker_color(speaker, color_map):
    """Return the colour string for ``speaker``, allocating one on first sight.
    A speaker that is not yet in ``color_map`` receives an ``hsl(...)`` string whose
    hue is 60 degrees times the number of entries already in the map, wrapped at
    360. ``color_map`` is mutated in place so later calls return the same colour.
    """
    if speaker in color_map:
        return color_map[speaker]
    hue = (len(color_map) * 60) % 360
    color_map[speaker] = f"hsl({hue}, 100%, 50%)"
    return color_map[speaker]
 # Function to parse the JSON and create subtitles
 def parse_subtitles(json_file):
    """Load transcript entries from a JSON file and convert them to subtitle tuples.
    Args:
        json_file: Path to a JSON file containing a list of objects with the keys
            "start_time", "end_time", "speaker" and "text".
    Returns:
        A list of ``((start_time, end_time), "<speaker>: <text>", color)`` tuples in
        file order, where ``color`` comes from ``get_speaker_color`` and is the same
        for every entry of a given speaker.
    Raises:
        KeyError: If an entry is missing one of the required keys.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    subtitles = []
    color_map = {}  # speaker -> colour, shared across all entries of this file
    for entry in data:
        start_time = entry["start_time"]
        end_time = entry["end_time"]
        speaker = entry["speaker"]
        text = entry["text"]
        color = get_speaker_color(speaker, color_map)
        subtitles.append(((start_time, end_time), f"{speaker}: {text}", color))
    return subtitles
 # Function to generate text clips
 def subtitle_generator(txt, color):
    """Build the text clip for one subtitle line.
    The clip uses 24-point Arial in ``color`` on a black background.
    """
    clip_options = {
        "fontsize": 24,
        "font": 'Arial',
        "color": color,
        "bg_color": 'black',
    }
    return TextClip(txt, **clip_options)
 # Main function to burn subtitles into the video
 def burn_subtitles(input_video, subtitle_json, output_video):
    """Render ``input_video`` with the subtitles from ``subtitle_json`` burned in.
    Args:
        input_video: Path of the source video.
        subtitle_json: Path of the JSON file read by ``parse_subtitles``.
        output_video: Path of the video file to write (encoded with libx264).
    """
    subtitles_data = parse_subtitles(subtitle_json)
    video = VideoFileClip(input_video)
    try:
        # Create subtitle clips, each shown bottom-centre for its own time span
        subtitle_clips = []
        for ((start, end), txt, color) in subtitles_data:
            subtitle_clip = (subtitle_generator(txt, color)
                             .set_position(('center', 'bottom'))
                             .set_start(start)
                             .set_end(end))
            subtitle_clips.append(subtitle_clip)
        # Overlay subtitles on the video
        final = CompositeVideoClip([video] + subtitle_clips)
        try:
            final.write_videofile(output_video, codec="libx264")
        finally:
            final.close()
    finally:
        # Release the source clip's reader even when rendering fails
        video.close()
 # Example usage (only when run as a script, so importing this module renders nothing)
 if __name__ == "__main__":
    # main.py in this project writes its transcript to 'transcription.json' using the
    # schema parse_subtitles expects, so that file is used as the subtitle source.
    burn_subtitles('input.mp4', 'transcription.json', 'output.mp4')