extract YouTube subtitle using keyword search

February 19, 2022 1 minute read

키워드 서치 후 유튜브 자막 긁어오기(복수의 비디오, 선택 순위 지정)

This is a python code in Colab to extract subtitles in a Youtube (single) video after searching a keyword.

Mount drive in Colab

# Mount my Google Drive (storage)
from google.colab import drive
drive.mount('/content/gdrive')

# data dir
import os
data_dir = '/content/gdrive/MyDrive/YOUR FOLDER NAME'  # Your data directory in Colab 
os.listdir(data_dir)

YouTube Data API

!pip install youtube_transcript_api

# Keyword Query
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from oauth2client.tools import argparser

# transcribing api
from youtube_transcript_api import YouTubeTranscriptApi

# text cleaning
import pandas
import re 
import string

Enter my API key

api_key = "  "

##### Enter search keyword
SearchQuery = "Language Testing" 

search video clips

# build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, API_KEY)
api_obj = build('youtube', 'v3', developerKey=api_key)
request = api_obj.search().list(
    q = SearchQuery, 
    order = "viewCount",      # viewCount, date, rating, relevance, title, videoCount
    part = "snippet",
    type = "video",  # video, playlist, channel
    maxResults=10) # first 10 video clips by view counts
response = request.execute()

extract video ids

video_ids = []
titles = []
i = 1 # Add numbering before "title"
for t in response["items"]:
  video_ids += {t["id"]["videoId"]}
  ttl = t["snippet"]["title"]
  title = str(i) + "_" + ttl 
  # print(title)
  exclude = ['/', ':', '(', ')', '|']
  title = {''.join(ch for ch in title if ch not in exclude)}
  # print(title)
  titles += title
  i += 1
video_titles = dict(zip(video_ids, titles))
print(video_titles)

extract subtitles function

### function 
def make_transcript(video_id): 
  try: 
      subtitles = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])  
      prompts = []
      words = []
      
      for subtitle in subtitles:        
          time = subtitle['start']
          prompt = subtitle['text']
          words.append(prompt)
          prompts.append([time, prompt])
       
      title = video_titles[video_id]
      print(title)

      with open(os.path.join(data_dir, SearchQuery + "_" + title + "_" + video_id + '.txt'), 'w') as file: 
        file.writelines(words)
        file.close()

      df = pandas.DataFrame(prompts, columns =['time', 'subtitle'])
      df.to_excel(os.path.join(data_dir, SearchQuery + "_" + title + "_" + video_id +'.xlsx'), index=None)
  except:
    print("no_subtitle")

Run the function

for video_id in video_ids: 
  make_transcript(video_id)

Share on

Twitter Facebook LinkedIn

Yongkook Won

extract YouTube subtitle using keyword search

Mount drive in Colab

YouTube Data API

Enter my API key

search video clips

extract video ids

extract subtitles function

Run the function

Share on

You may also enjoy

Python File Rename Method

Pandas DataFrame ffill() Method

find sentences with a word list

extract YouTube subtitle: single video