jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/imdb.py
[cleanup] Add more ruff rules (#10149)
[yt-dlp.git] / yt_dlp / extractor / imdb.py
1 import base64
2 import json
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import (
7 determine_ext,
8 int_or_none,
9 mimetype2ext,
10 qualities,
11 traverse_obj,
12 try_get,
13 url_or_none,
14 )
15
16
class ImdbIE(InfoExtractor):
    """Extractor for a single IMDb video identified by a ``vi<digits>`` id.

    ``_VALID_URL`` accepts ``www.``/``m.`` imdb.com URLs under ``/video``,
    ``/title`` or ``/list`` that contain ``vi<digits>`` preceded by ``/`` or
    ``-``; only the numeric id is kept and the canonical video page is fetched.
    """

    IE_NAME = 'imdb'
    IE_DESC = 'Internet Movie Database trailers'
    _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title|list).*?[/-]vi(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://www.imdb.com/video/imdb/vi2524815897',
        'info_dict': {
            'id': '2524815897',
            'ext': 'mp4',
            'title': 'No. 2',
            'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7',
            'duration': 152,
            'thumbnail': r're:^https?://.+\.jpg',
        },
    }, {
        'url': 'https://www.imdb.com/video/vi3516832537',
        'info_dict': {
            'id': '3516832537',
            'ext': 'mp4',
            'title': 'Paul: U.S. Trailer #1',
            'description': 'md5:17fcc4fe11ec29b4399be9d4c5ef126c',
            'duration': 153,
            'thumbnail': r're:^https?://.+\.jpg',
        },
    }, {
        'url': 'http://www.imdb.com/video/_/vi2524815897',
        'only_matching': True,
    }, {
        'url': 'http://www.imdb.com/title/tt1667889/?ref_=ext_shr_eml_vi#lb-vi2524815897',
        'only_matching': True,
    }, {
        'url': 'http://www.imdb.com/title/tt1667889/#lb-vi2524815897',
        'only_matching': True,
    }, {
        'url': 'http://www.imdb.com/videoplayer/vi1562949145',
        'only_matching': True,
    }, {
        'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561',
        'only_matching': True,
    }, {
        'url': 'https://www.imdb.com/list/ls009921623/videoplayer/vi260482329',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict (formats, subtitles, metadata) for one video.

        The encodings come from ``playbackURLs`` in the page's Next.js data
        when present; otherwise the ``VIDEO_PLAYBACK_DATA`` endpoint is queried
        and its ``videoLegacyEncodings`` are used.
        """
        video_id = self._match_id(url)
        # Always request the canonical page, whichever URL variant was matched.
        webpage = self._download_webpage(f'https://www.imdb.com/video/vi{video_id}', video_id)
        info = self._search_nextjs_data(webpage, video_id)
        video_info = traverse_obj(info, ('props', 'pageProps', 'videoPlaybackData', 'video'), default={})

        # Prefer the structured title, then page meta tags, then <title>.
        title = (traverse_obj(video_info, ('name', 'value'), ('primaryTitle', 'titleText', 'text'))
                 or self._html_search_meta(('og:title', 'twitter:title'), webpage, default=None)
                 or self._html_extract_title(webpage))

        data = video_info.get('playbackURLs')
        if not data:
            # The endpoint takes a base64-encoded JSON descriptor as its key.
            playback_key = base64.b64encode(json.dumps({
                'type': 'VIDEO_PLAYER',
                'subType': 'FORCE_LEGACY',
                'id': f'vi{video_id}',
            }).encode()).decode()
            legacy_data = self._download_json(
                'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA', video_id,
                query={'key': playback_key})
            data = try_get(legacy_data, lambda x: x[0]['videoLegacyEncodings'])

        quality = qualities(('SD', '480p', '720p', '1080p'))
        formats, subtitles = [], {}
        # `data` is None when neither source provided encodings; iterate an
        # empty list then instead of failing with a TypeError.
        for encoding in data or []:
            if not encoding or not isinstance(encoding, dict):
                continue
            video_url = url_or_none(encoding.get('url'))
            if not video_url:
                continue
            ext = mimetype2ext(encoding.get('mimeType')) or determine_ext(video_url)
            if ext == 'm3u8':
                # HLS manifests expand into several formats plus subtitles.
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    video_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    preference=1, m3u8_id='hls', fatal=False)
                subtitles = self._merge_subtitles(subtitles, subs)
                formats.extend(fmts)
                continue
            format_id = traverse_obj(encoding, ('displayName', 'value'), 'definition')
            formats.append({
                'format_id': format_id,
                'url': video_url,
                'ext': ext,
                'quality': quality(format_id),
            })

        return {
            'id': video_id,
            'title': title,
            # NOTE(review): read from the top level of the Next.js data, not
            # from `video_info` - confirm this key is still populated there.
            'alt_title': info.get('videoSubTitle'),
            'formats': formats,
            'description': try_get(video_info, lambda x: x['description']['value']),
            'thumbnail': url_or_none(try_get(video_info, lambda x: x['thumbnail']['url'])),
            'duration': int_or_none(try_get(video_info, lambda x: x['runtime']['value'])),
            'subtitles': subtitles,
        }
114
115
class ImdbListIE(InfoExtractor):
    """Extractor for IMDb lists (``/list/ls<9 digits>``) as playlists.

    The negative lookahead in ``_VALID_URL`` rejects
    ``/list/ls.../videoplayer/vi...`` URLs, which point at a single video.
    """

    IE_NAME = 'imdb:list'
    IE_DESC = 'Internet Movie Database lists'
    _VALID_URL = r'https?://(?:www\.)?imdb\.com/list/ls(?P<id>\d{9})(?!/videoplayer/vi\d+)'
    _TEST = {
        'url': 'https://www.imdb.com/list/ls009921623/',
        'info_dict': {
            'id': '009921623',
            'title': 'The Bourne Legacy',
            'description': 'A list of trailers, clips, and more from The Bourne Legacy, starring Jeremy Renner and Rachel Weisz.',
        },
        'playlist_count': 8,
    }

    def _real_extract(self, url):
        """Return a playlist of the video-player links found on the list page.

        Title and description are optional: if either is missing from the
        page, the playlist is still returned with that field left unset.
        """
        list_id = self._match_id(url)
        webpage = self._download_webpage(url, list_id)

        # Each matching href is a site-relative link to one video of this list.
        video_paths = re.findall(
            rf'href="(/list/ls{list_id}/videoplayer/vi[^"]+)"', webpage)
        entries = [
            self.url_result('http://www.imdb.com' + path, 'Imdb')
            for path in video_paths]

        # Do not let missing page metadata abort the extracted entries:
        # a missing title only warns, a missing description is silent.
        list_title = self._html_search_regex(
            r'<h1[^>]+class="[^"]*header[^"]*"[^>]*>(.*?)</h1>',
            webpage, 'list title', fatal=False)
        list_description = self._html_search_regex(
            r'<div[^>]+class="[^"]*list-description[^"]*"[^>]*><p>(.*?)</p>',
            webpage, 'list description', default=None)

        return self.playlist_result(entries, list_id, list_title, list_description)