]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/glomex.py
ec3c35c6f5ae6297177ccd2f7685f9b871c7c104
[yt-dlp.git] / yt_dlp / extractor / glomex.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5 import urllib.parse
6
7 from .common import InfoExtractor
8 from ..utils import (
9 determine_ext,
10 ExtractorError,
11 int_or_none,
12 parse_qs,
13 smuggle_url,
14 unescapeHTML,
15 unsmuggle_url,
16 )
17
18
class GlomexBaseIE(InfoExtractor):
    """Shared helpers for the Glomex extractors.

    Covers smuggling the embedding page ("origin") URL into player URLs and
    downloading/parsing the response of the Glomex integration API.
    """

    _DEFAULT_ORIGIN_URL = 'https://player.glomex.com/'
    _API_URL = 'https://integration-cloudfront-eu-west-1.mes.glomex.cloud/'
    # Label for each known ID prefix (the part before the first '-');
    # only used to word the download progress/error messages
    _VIDEOID_TYPES = {
        'v': 'video',
        'pl': 'playlist',
        'rl': 'related videos playlist',
        'cl': 'curated playlist',
    }

    @staticmethod
    def _smuggle_origin_url(url, origin_url):
        """Attach origin_url to url as smuggled data; return url unchanged if origin_url is None."""
        if origin_url is None:
            return url
        return smuggle_url(url, {'origin': origin_url})

    @classmethod
    def _unsmuggle_origin_url(cls, url, fallback_origin_url=None):
        """Return a tuple (url without smuggled data, origin URL).

        If no origin was smuggled, the origin is fallback_origin_url or,
        failing that, _DEFAULT_ORIGIN_URL.
        """
        defaults = {'origin': fallback_origin_url or cls._DEFAULT_ORIGIN_URL}
        unsmuggled_url, data = unsmuggle_url(url, default=defaults)
        # Smuggled data may exist without an 'origin' entry; fall back instead of raising KeyError
        return unsmuggled_url, data.get('origin') or defaults['origin']

    def _get_videoid_type(self, video_id):
        """Return a human-readable label for the kind of ID, or 'unknown type'."""
        prefix = video_id.split('-')[0]
        return self._VIDEOID_TYPES.get(prefix, 'unknown type')

    def _download_api_data(self, video_id, integration, current_url=None):
        """Download the API JSON for video_id using the given integration ID.

        current_url is sent as the 'current_url' query parameter and defaults
        to _DEFAULT_ORIGIN_URL when empty.
        """
        query = {
            'integration_id': integration,
            'playlist_id': video_id,
            'current_url': current_url or self._DEFAULT_ORIGIN_URL,
        }
        video_id_type = self._get_videoid_type(video_id)
        return self._download_json(
            self._API_URL,
            video_id, 'Downloading %s JSON' % video_id_type,
            'Unable to download %s JSON' % video_id_type,
            query=query)

    def _download_and_extract_api_data(self, video_id, integration, current_url):
        """Download the API data and return a single info dict or a playlist result.

        Raises ExtractorError if the response lists no videos.
        """
        api_data = self._download_api_data(video_id, integration, current_url)
        # A response without the 'videos' key is treated like an empty list
        videos = api_data.get('videos')
        if not videos:
            raise ExtractorError('no videos found for %s' % video_id)
        videos = [self._extract_api_data(video, video_id) for video in videos]
        return videos[0] if len(videos) == 1 else self.playlist_result(videos, video_id)

    def _extract_api_data(self, video, video_id):
        """Build an info dict from one entry of the API's 'videos' list."""
        if video.get('error_code') == 'contentGeoblocked':
            # The country list is optional extra information for the error
            # NOTE(review): relies on raise_geo_restricted accepting countries=None - confirm
            self.raise_geo_restricted(countries=video.get('geo_locations'))

        formats, subs = [], {}
        # A missing/empty 'source' leaves formats empty so _sort_formats reports the problem
        for format_id, format_url in (video.get('source') or {}).items():
            ext = determine_ext(format_url)
            if ext == 'm3u8':
                formats_, subs_ = self._extract_m3u8_formats_and_subtitles(
                    format_url, video_id, 'mp4', m3u8_id=format_id,
                    fatal=False)
                formats.extend(formats_)
                self._merge_subtitles(subs_, target=subs)
            else:
                formats.append({
                    'url': format_url,
                    'format_id': format_id,
                })
        if video.get('language'):
            for fmt in formats:
                fmt['language'] = video['language']
        self._sort_formats(formats)

        # 'images' (list) and 'image' (single entry) are both considered;
        # entries without a URL are dropped by the filter below
        images = (video.get('images') or []) + [video.get('image') or {}]
        thumbnails = [{
            'id': image.get('id'),
            'url': f'{image["url"]}/profile:player-960x540',
            'width': 960,
            'height': 540,
        } for image in images if image.get('url')]
        self._remove_duplicate_formats(thumbnails)

        return {
            'id': video.get('clip_id') or video_id,
            'title': video.get('title'),
            'description': video.get('description'),
            'thumbnails': thumbnails,
            'duration': int_or_none(video.get('clip_duration')),
            'timestamp': video.get('created_at'),
            'formats': formats,
            'subtitles': subs,
        }
108
109
class GlomexIE(GlomexBaseIE):
    """Extractor for video pages on video.glomex.com; delegates to GlomexEmbedIE."""

    IE_NAME = 'glomex'
    IE_DESC = 'Glomex videos'
    # The ID is 'v-' followed by everything up to the next '-' in the URL slug
    _VALID_URL = r'https?://video\.glomex\.com/[^/]+/(?P<id>v-[^-]+)'
    # Integration ID passed to the player URL for all videos from this site
    _INTEGRATION_ID = '19syy24xjn1oqlpc'

    _TESTS = [{
        'url': 'https://video.glomex.com/sport/v-cb24uwg77hgh-nach-2-0-sieg-guardiola-mit-mancity-vor-naechstem-titel',
        'md5': 'cec33a943c4240c9cb33abea8c26242e',
        'info_dict': {
            'id': 'v-cb24uwg77hgh',
            'ext': 'mp4',
            'title': 'md5:38a90cedcfadd72982c81acf13556e0c',
            'description': 'md5:1ea6b6caff1443fcbbba159e432eedb8',
            'duration': 29600,
            'timestamp': 1619895017,
            'upload_date': '20210501',
        },
    }]

    def _real_extract(self, url):
        """Return a url_result pointing at the embed player URL for this video.

        The page URL itself is passed along as the origin URL.
        """
        clip_id = self._match_id(url)
        player_url = GlomexEmbedIE.build_player_url(clip_id, self._INTEGRATION_ID, url)
        return self.url_result(player_url, GlomexEmbedIE.ie_key(), clip_id)
135
136
class GlomexEmbedIE(GlomexBaseIE):
    """Extractor for Glomex iframe player URLs and for Glomex embeds found in web pages."""

    IE_NAME = 'glomex:embed'
    IE_DESC = 'Glomex embedded videos'
    _BASE_PLAYER_URL = '//player.glomex.com/integration/1/iframe-player.html'
    # Same URL as a regex, with the '/1/' path segment relaxed to any single non-slash character
    _BASE_PLAYER_URL_RE = re.escape(_BASE_PLAYER_URL).replace('/1/', r'/[^/]/')
    _VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?playlistId=(?P<id>[^#&]+)'

    _TESTS = [{
        'url': 'https://player.glomex.com/integration/1/iframe-player.html?integrationId=4059a013k56vb2yd&playlistId=v-cfa6lye0dkdd-sf',
        'md5': '68f259b98cc01918ac34180142fce287',
        'info_dict': {
            'id': 'v-cfa6lye0dkdd-sf',
            'ext': 'mp4',
            'timestamp': 1635337199,
            'duration': 133080,
            'upload_date': '20211027',
            'description': 'md5:e741185fc309310ff5d0c789b437be66',
            'title': 'md5:35647293513a6c92363817a0fb0a7961',
        },
    }, {
        'url': 'https://player.glomex.com/integration/1/iframe-player.html?origin=fullpage&integrationId=19syy24xjn1oqlpc&playlistId=rl-vcb49w1fb592p&playlistIndex=0',
        'info_dict': {
            'id': 'rl-vcb49w1fb592p',
        },
        'playlist_count': 100,
    }, {
        'url': 'https://player.glomex.com/integration/1/iframe-player.html?playlistId=cl-bgqaata6aw8x&integrationId=19syy24xjn1oqlpc',
        'info_dict': {
            'id': 'cl-bgqaata6aw8x',
        },
        'playlist_mincount': 2,
    }]

    @classmethod
    def build_player_url(cls, video_id, integration, origin_url=None):
        """Return an https player URL for video_id/integration, with origin_url smuggled in if given."""
        query_string = urllib.parse.urlencode({
            'playlistId': video_id,
            'integrationId': integration,
        })
        return cls._smuggle_origin_url(f'https:{cls._BASE_PLAYER_URL}?{query_string}', origin_url)

    @classmethod
    def _extract_urls(cls, webpage, origin_url):
        """Yield a player URL (with origin_url smuggled in) for each Glomex embed in webpage.

        Three embed forms are recognised:
        * an <iframe> whose src is a player URL accepted by _VALID_URL,
        * a <glomex-player> element, or a <div> marked data-glomex-player="true",
          carrying data-integration-id and data-playlist-id attributes,
        * an inline <script> with hard-coded integrationId and playlistId values.
        Matches that lack either the playlist ID or the integration ID are skipped.
        """
        # in comparison with _VALID_URL:
        # * make the scheme optional
        # * simplify the query string part; after extracting iframe src, the URL will be matched again
        VALID_SRC = rf'(?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'

        # https://docs.glomex.com/publisher/video-player-integration/javascript-api/
        EMBED_RE = r'''(?x)(?:
            <iframe[^>]+?src=(?P<_q1>%(quot_re)s)(?P<url>%(url_re)s)(?P=_q1)|
            <(?P<html_tag>glomex-player|div)(?:
                data-integration-id=(?P<_q2>%(quot_re)s)(?P<integration_html>(?:(?!(?P=_q2)).)+)(?P=_q2)|
                data-playlist-id=(?P<_q3>%(quot_re)s)(?P<id_html>(?:(?!(?P=_q3)).)+)(?P=_q3)|
                data-glomex-player=(?P<_q4>%(quot_re)s)(?P<glomex_player>true)(?P=_q4)|
                [^>]*?
            )+>|
            # naive parsing of inline scripts for hard-coded integration parameters
            <(?P<script_tag>script)[^<]*?>(?:
                (?P<_stjs1>dataset\.)?integrationId\s*(?(_stjs1)=|:)\s*
                (?P<_q5>%(quot_re)s)(?P<integration_js>(?:(?!(?P=_q5)).)+)(?P=_q5)\s*(?(_stjs1);|,)?|
                (?P<_stjs2>dataset\.)?playlistId\s*(?(_stjs2)=|:)\s*
                (?P<_q6>%(quot_re)s)(?P<id_js>(?:(?!(?P=_q6)).)+)(?P=_q6)\s*(?(_stjs2);|,)?|
                (?:\s|.)*?
            )+</script>
        )''' % {'quot_re': r'["\']', 'url_re': VALID_SRC}

        for mobj in re.finditer(EMBED_RE, webpage):
            mdict = mobj.groupdict()
            if mdict.get('url'):
                url = unescapeHTML(mdict['url'])
                if not cls.suitable(url):
                    continue
                yield cls._smuggle_origin_url(url, origin_url)
            elif mdict.get('html_tag'):
                # A plain <div> only counts as a player when explicitly flagged
                if mdict['html_tag'] == 'div' and not mdict.get('glomex_player'):
                    continue
                # The playlist ID groups in EMBED_RE are named id_html / id_js
                if not mdict.get('id_html') or not mdict.get('integration_html'):
                    continue
                yield cls.build_player_url(mdict['id_html'], mdict['integration_html'], origin_url)
            elif mdict.get('script_tag'):
                if not mdict.get('id_js') or not mdict.get('integration_js'):
                    continue
                yield cls.build_player_url(mdict['id_js'], mdict['integration_js'], origin_url)

    def _real_extract(self, url):
        """Extract the video or playlist referenced by a player URL.

        Raises ExtractorError (expected) if the URL has no integrationId parameter.
        """
        url, origin_url = self._unsmuggle_origin_url(url)
        playlist_id = self._match_id(url)
        integration = parse_qs(url).get('integrationId', [None])[0]
        if not integration:
            raise ExtractorError('No integrationId in URL', expected=True)
        return self._download_and_extract_api_data(playlist_id, integration, origin_url)