extract YouTube subtitle using keyword search
키워드 서치 후 유튜브 자막 긁어오기(복수의 비디오, 선택 순위 지정)
This is Python code for Google Colab that searches YouTube for a keyword and extracts the subtitles of the resulting videos (the top results in the chosen sort order).
Mount drive in Colab
# Mount my Google Drive (storage)
# Colab-only step: the Drive is mounted under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')
# data dir
# Folder the transcript files are saved into; replace the placeholder with a real folder name.
import os
data_dir = '/content/gdrive/MyDrive/YOUR FOLDER NAME' # Your data directory in Colab
# List the folder's contents; this raises if the folder does not exist.
os.listdir(data_dir)
YouTube Data API
!pip install youtube_transcript_api
# Keyword Query
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from oauth2client.tools import argparser
# transcribing api
from youtube_transcript_api import YouTubeTranscriptApi
# text cleaning
import pandas
import re
import string
Enter my API key
# Developer key passed to the YouTube Data API client built below;
# replace the blank placeholder with your own key.
api_key = " "
##### Enter search keyword
# Used both as the search term and as the prefix of the output file names.
SearchQuery = "Language Testing"
search video clips
# Create the YouTube Data API v3 client, authenticated with the developer key.
# Signature: build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=API_KEY)
api_obj = build('youtube', 'v3', developerKey=api_key)

# Ask for the 10 most-viewed videos matching the search keyword.
request = api_obj.search().list(
    part="snippet",
    type="video",       # other resource types: playlist, channel
    q=SearchQuery,
    order="viewCount",  # alternatives: date, rating, relevance, title, videoCount
    maxResults=10,
)
response = request.execute()
extract video ids
# Characters removed from titles so they can be embedded in file names.
# Defined once, outside the loop, because it never changes.
exclude = ['/', ':', '(', ')', '|']

video_ids = []
titles = []
# Walk the search results in order; `i` is the 1-based rank that is
# prefixed to each title ("1_<title>", "2_<title>", ...).
for i, t in enumerate(response["items"], start=1):
    video_ids.append(t["id"]["videoId"])
    title = str(i) + "_" + t["snippet"]["title"]
    # print(title)
    titles.append(''.join(ch for ch in title if ch not in exclude))

# Map each video id to its cleaned, numbered title.
video_titles = dict(zip(video_ids, titles))
print(video_titles)
extract subtitles function
### function
def make_transcript(video_id):
    """Fetch the English subtitles of one video and save them to ``data_dir``.

    Two files are written, both named
    ``<SearchQuery>_<title>_<video_id>`` plus an extension:

    * ``.txt``  -- the caption texts, one caption per line;
    * ``.xlsx`` -- a table with the columns ``time`` (caption start) and
      ``subtitle`` (caption text).

    The function reads the module-level names ``data_dir``, ``SearchQuery``
    and ``video_titles``.

    Args:
        video_id: YouTube video id whose transcript should be downloaded.

    Returns:
        None. Failures are reported with ``print`` instead of being raised,
        so a loop over many videos keeps going after one of them fails.
    """
    try:
        subtitles = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
    except Exception as err:
        # Best-effort: skip this video, but name the error type so that a
        # missing transcript can be told apart from other failures.
        # NOTE(review): newer youtube_transcript_api releases appear to replace
        # get_transcript() with an instance-level fetch() -- if every video
        # reports AttributeError here, confirm the installed version.
        print("no_subtitle", video_id, type(err).__name__)
        return

    texts = [subtitle['text'] for subtitle in subtitles]
    rows = [[subtitle['start'], subtitle['text']] for subtitle in subtitles]

    # Fall back to a neutral label when the id did not come from the search.
    title = video_titles.get(video_id, "untitled")
    print(title)
    base_path = os.path.join(data_dir, SearchQuery + "_" + title + "_" + video_id)

    try:
        with open(base_path + '.txt', 'w', encoding='utf-8') as txt_file:
            # writelines() adds no separators, which glued captions together;
            # join explicitly so every caption ends up on its own line.
            txt_file.write("\n".join(texts))
        df = pandas.DataFrame(rows, columns=['time', 'subtitle'])
        df.to_excel(base_path + '.xlsx', index=None)
    except Exception as err:
        # Saving problems (bad path, missing Excel writer, ...) are reported
        # as such rather than being mislabelled as a missing subtitle.
        print("save_failed", video_id, type(err).__name__, err)
Run the function
# Save the subtitles of every video found by the search, in result order.
for video_id in video_ids:
    make_transcript(video_id)