-
Notifications
You must be signed in to change notification settings - Fork 0
/
youtube.py
129 lines (107 loc) · 4.4 KB
/
youtube.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from pytube import YouTube
import xml.etree.ElementTree as ET
from audioToText import generateNoCaption
import os
def get_title(yt_link):
    """Return the title of the YouTube video at ``yt_link``.

    Args:
        yt_link: URL of the video, passed unchanged to ``pytube.YouTube``.

    Returns:
        The ``title`` attribute of the constructed ``YouTube`` object.

    Raises:
        ValueError: If the ``YouTube`` object cannot be constructed from
            ``yt_link``. The underlying error is attached as ``__cause__``.
    """
    try:
        yt = YouTube(yt_link)
    except Exception as err:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not reported as a bad URL, and
        # chain the cause so the real failure stays visible in tracebacks.
        raise ValueError("Invalid URL") from err
    # Reading the title is intentionally outside the try block: only a failed
    # construction is treated as an invalid URL.
    return yt.title
def get_caption(yt_link, vid_id):
    """Download a video's MP4 audio and return its title with timed lyrics.

    The first audio-only MP4 stream is saved as ``audio/<vid_id>.mp4`` in the
    ``audio`` folder next to this module. Lyrics are then taken from the first
    English-looking caption track that parses without error. If no track
    qualifies, ``generateNoCaption`` is called with the downloaded file name.

    Args:
        yt_link: URL of the video, e.g. ``https://youtu.be/u9Dg-g7t2l4`` or
            ``https://youtube.com/watch?v=u9Dg-g7t2l4``.
        vid_id: Server-side identifier; used as the stem of the audio file.

    Returns:
        A ``(title, lyrics)`` tuple. Caption-derived lyrics are the list built
        by ``preprocess_lines``: one dict per line with ``start`` and
        ``duration`` in seconds plus ``text``, for example::

            [
                {"start": 1.395, "duration": 4.931, "text": "..."},
                {"start": 6.326, "duration": 5.000, "text": "..."},
            ]

        The shape of the ``generateNoCaption`` result is not visible from this
        module -- presumably the same; confirm against ``audioToText``.

    Raises:
        ValueError: If the video has no audio-only MP4 stream, or if its
            reported length exceeds ``10 * 60`` (the 10-minute cap that keeps
            very long videos from clogging the server).
    """
    video = YouTube(yt_link)

    mp4_audio = video.streams.filter(only_audio=True, subtype="mp4")
    if len(mp4_audio) == 0:
        raise ValueError("Video does not have any MP4 stream.")
    if video.length > 10 * 60:
        raise ValueError("Video is longer than 10 mins.")

    # The audio is saved before captions are inspected; the fallback
    # transcription below is handed this same file name.
    audio_name = f"{vid_id}.mp4"
    audio_dir = os.path.join(os.path.dirname(__file__), "audio")
    mp4_audio[0].download(output_path=audio_dir, filename=audio_name)

    lyrics = None
    english_markers = ("English", "en-", "en -")
    # NOTE(review): each item yielded here is expected to expose ``.code`` and
    # to have a string form that names its language -- confirm against the
    # installed pytube version.
    for track in video.captions.keys():
        label = str(track)
        if label != "en" and not any(marker in label for marker in english_markers):
            continue
        try:
            root = ET.fromstring(video.captions.get(track.code).xml_captions)
            lyrics = preprocess_lines(root.find("body"))
            print("Captions retrieved from YouTube")
            break
        except Exception as err:
            # Best effort: report the failure and try the next candidate track.
            print(err)
            print(f"For key: {label}")

    if lyrics is None:
        print("Auto-generating captions")
        lyrics = generateNoCaption(filename=audio_name)

    return video.title, lyrics
def preprocess_lines(body, min_duration=3, min_words=2):
    """Merge short caption entries into lines of a usable length.

    Consecutive entries are accumulated until the running line lasts at least
    ``min_duration`` seconds and contains at least ``min_words`` words; the
    line is then emitted and a new one is started. Whatever is still pending
    after the last entry is merged into the final emitted line, or emitted on
    its own when nothing was emitted before it.

    Args:
        body: Iterable of XML elements (the caption ``<body>``). Each element
            must carry ``t`` (start) and ``d`` (duration) attributes; both are
            divided by 1000 to obtain seconds.
        min_duration: Minimum duration, in seconds, of an emitted line.
        min_words: Minimum number of whitespace-separated words in an emitted
            line.

    Returns:
        A list of dicts with keys ``start``, ``duration`` and ``text``.
        ``duration`` is the sum of the merged entries' own durations; gaps
        between entries are not counted. ``text`` has the musical-note marker
        removed and all whitespace runs collapsed to single spaces.

    Raises:
        TypeError: If ``body`` is not iterable (e.g. ``None``).
        KeyError: If an element lacks the ``t`` or ``d`` attribute.
        ValueError: If ``t`` or ``d`` is not numeric.
    """
    lines = []
    # Line currently being accumulated; None means "between lines". An explicit
    # sentinel avoids mistaking a zero-duration entry for an empty line.
    pending = None

    for node in body:
        start = float(node.attrib["t"]) / 1000
        duration = float(node.attrib["d"]) / 1000
        # itertext() tolerates elements whose .text is None and also picks up
        # text held in child elements; split()/join collapses newlines and
        # repeated spaces in one pass.
        raw_text = " ".join(node.itertext())
        text = " ".join(raw_text.replace("♪", "").split())

        if pending is None:
            pending = {"start": start, "duration": duration, "text": text}
        else:
            pending["duration"] += duration
            if text:
                pending["text"] = f"{pending['text']} {text}".strip()

        long_enough = pending["duration"] >= min_duration
        wordy_enough = len(pending["text"].split()) >= min_words
        if long_enough and wordy_enough:
            lines.append(pending)
            pending = None

    if pending is not None:
        if lines:
            # Trailing remainder is too short to stand alone: fold it into the
            # previous line.
            last = lines[-1]
            last["duration"] += pending["duration"]
            if pending["text"]:
                last["text"] = f"{last['text']} {pending['text']}".strip()
        else:
            # Nothing ever reached the thresholds; keep the content rather
            # than failing on an empty result list.
            lines.append(pending)

    return lines
def test():
    """Manual smoke check: fetch captions for a sample video and print them.

    Calls ``get_caption`` with a fixed URL and the id ``'sample'``, so it
    needs network access.
    """
    result = get_caption('https://www.youtube.com/watch?v=cMg8KaMdDYo', 'sample')
    print(result)
# Run the manual smoke check only when this file is executed as a script.
if __name__ == "__main__":
    test()