]>
Commit | Line | Data |
---|---|---|
71738b14 ZM |
1 | # coding: utf-8 |
2 | from __future__ import unicode_literals | |
3 | ||
4 | import re | |
5 | import urllib.parse | |
6 | ||
7 | from .common import InfoExtractor | |
8 | from ..utils import ( | |
9 | determine_ext, | |
10 | ExtractorError, | |
11 | int_or_none, | |
12 | parse_qs, | |
13 | smuggle_url, | |
14 | unescapeHTML, | |
15 | unsmuggle_url, | |
16 | ) | |
17 | ||
18 | ||
class GlomexBaseIE(InfoExtractor):
    # Shared plumbing for the Glomex extractors: passing the embedding page's
    # URL ("origin") along with a player URL, querying the integration API and
    # turning one API video entry into an info dict.
    _DEFAULT_ORIGIN_URL = 'https://player.glomex.com/'
    _API_URL = 'https://integration-cloudfront-eu-west-1.mes.glomex.cloud/'

    @staticmethod
    def _smuggle_origin_url(url, origin_url):
        """Return url with origin_url smuggled in under the 'origin' key.

        When origin_url is None the url is returned unchanged.
        """
        if origin_url is None:
            return url
        return smuggle_url(url, {'origin': origin_url})

    @classmethod
    def _unsmuggle_origin_url(cls, url, fallback_origin_url=None):
        """Split a (possibly smuggled) url into (plain_url, origin_url).

        The default handed to unsmuggle_url carries fallback_origin_url, or
        _DEFAULT_ORIGIN_URL when no fallback is given.
        """
        defaults = {'origin': fallback_origin_url or cls._DEFAULT_ORIGIN_URL}
        unsmuggled_url, data = unsmuggle_url(url, default=defaults)
        return unsmuggled_url, data['origin']

    def _get_videoid_type(self, video_id):
        """Map the prefix of a Glomex ID (text before the first '-') to a label.

        Unrecognised prefixes give 'unknown type'. The label is only used in
        the download progress/error messages.
        """
        _VIDEOID_TYPES = {
            'v': 'video',
            'pl': 'playlist',
            'rl': 'related videos playlist',
            'cl': 'curated playlist',
        }
        prefix = video_id.split('-')[0]
        return _VIDEOID_TYPES.get(prefix, 'unknown type')

    def _download_api_data(self, video_id, integration, current_url=None):
        """Fetch the JSON document for video_id from _API_URL.

        integration is sent as 'integration_id', video_id as 'playlist_id',
        and current_url (or _DEFAULT_ORIGIN_URL when falsy) as 'current_url'.
        """
        query = {
            'integration_id': integration,
            'playlist_id': video_id,
            'current_url': current_url or self._DEFAULT_ORIGIN_URL,
        }
        video_id_type = self._get_videoid_type(video_id)
        return self._download_json(
            self._API_URL,
            video_id, 'Downloading %s JSON' % video_id_type,
            'Unable to download %s JSON' % video_id_type,
            query=query)

    def _download_and_extract_api_data(self, video_id, integration, current_url):
        """Download the API data and convert it into an extraction result.

        Returns the single info dict when the API lists exactly one video,
        otherwise a playlist result wrapping all of them.

        Raises ExtractorError when the response has no (or an empty) 'videos'
        entry.
        """
        api_data = self._download_api_data(video_id, integration, current_url)
        # .get() so that a response without a 'videos' key is reported through
        # the ExtractorError below rather than as a bare KeyError.
        videos = api_data.get('videos')
        if not videos:
            raise ExtractorError('no videos found for %s' % video_id)
        videos = [self._extract_api_data(video, video_id) for video in videos]
        return videos[0] if len(videos) == 1 else self.playlist_result(videos, video_id)

    def _extract_api_data(self, video, video_id):
        """Build an info dict from one entry of the API's 'videos' list.

        video_id is used for the m3u8 downloads and as the fallback 'id' when
        the entry has no 'clip_id'.
        """
        if video.get('error_code') == 'contentGeoblocked':
            # 'geo_locations' is read leniently: a missing key must not replace
            # the geo-restriction report with a KeyError.
            self.raise_geo_restricted(countries=video.get('geo_locations'))

        formats, subs = [], {}
        # An entry without 'source' simply contributes no formats here instead
        # of failing with a KeyError.
        for format_id, format_url in (video.get('source') or {}).items():
            ext = determine_ext(format_url)
            if ext == 'm3u8':
                formats_, subs_ = self._extract_m3u8_formats_and_subtitles(
                    format_url, video_id, 'mp4', m3u8_id=format_id,
                    fatal=False)
                formats.extend(formats_)
                self._merge_subtitles(subs_, target=subs)
            else:
                formats.append({
                    'url': format_url,
                    'format_id': format_id,
                })
        if video.get('language'):
            # The API reports one language per clip; tag every format with it.
            for fmt in formats:
                fmt['language'] = video['language']
        self._sort_formats(formats)

        # Candidates come from the 'images' list plus the single 'image'
        # entry; entries without a 'url' are skipped below.
        images = (video.get('images') or []) + [video.get('image') or {}]
        thumbnails = [{
            'id': image.get('id'),
            'url': f'{image["url"]}/profile:player-960x540',
            'width': 960,
            'height': 540,
        } for image in images if image.get('url')]
        self._remove_duplicate_formats(thumbnails)

        return {
            'id': video.get('clip_id') or video_id,
            'title': video.get('title'),
            'description': video.get('description'),
            'thumbnails': thumbnails,
            'duration': int_or_none(video.get('clip_duration')),
            'timestamp': video.get('created_at'),
            'formats': formats,
            'subtitles': subs,
        }
108 | ||
109 | ||
class GlomexIE(GlomexBaseIE):
    # Extractor for video pages on video.glomex.com; the real work is handed
    # over to GlomexEmbedIE through a player URL built from the page's video ID.
    IE_NAME = 'glomex'
    IE_DESC = 'Glomex videos'
    _VALID_URL = r'https?://video\.glomex\.com/[^/]+/(?P<id>v-[^-]+)'
    # Integration ID used for every player URL built by this extractor.
    _INTEGRATION_ID = '19syy24xjn1oqlpc'

    _TESTS = [{
        'url': 'https://video.glomex.com/sport/v-cb24uwg77hgh-nach-2-0-sieg-guardiola-mit-mancity-vor-naechstem-titel',
        'md5': 'cec33a943c4240c9cb33abea8c26242e',
        'info_dict': {
            'id': 'v-cb24uwg77hgh',
            'ext': 'mp4',
            'title': 'md5:38a90cedcfadd72982c81acf13556e0c',
            'description': 'md5:1ea6b6caff1443fcbbba159e432eedb8',
            'duration': 29600,
            'timestamp': 1619895017,
            'upload_date': '20210501',
        },
    }]

    def _real_extract(self, url):
        """Redirect a video page URL to the matching embedded-player URL.

        The page URL itself is passed on as the origin of the player URL.
        """
        video_id = self._match_id(url)
        player_url = GlomexEmbedIE.build_player_url(
            video_id, self._INTEGRATION_ID, url)
        return self.url_result(player_url, GlomexEmbedIE.ie_key(), video_id)
135 | ||
136 | ||
class GlomexEmbedIE(GlomexBaseIE):
    # Extractor for the Glomex iframe player, plus a scanner (_extract_urls)
    # that finds Glomex embeds in third-party webpages.
    IE_NAME = 'glomex:embed'
    IE_DESC = 'Glomex embedded videos'
    _BASE_PLAYER_URL = '//player.glomex.com/integration/1/iframe-player.html'
    # Same URL as a regex, with the '1' path segment relaxed to any single
    # non-slash character.
    _BASE_PLAYER_URL_RE = re.escape(_BASE_PLAYER_URL).replace('/1/', r'/[^/]/')
    _VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?playlistId=(?P<id>[^#&]+)'

    _TESTS = [{
        'url': 'https://player.glomex.com/integration/1/iframe-player.html?integrationId=4059a013k56vb2yd&playlistId=v-cfa6lye0dkdd-sf',
        'md5': '68f259b98cc01918ac34180142fce287',
        'info_dict': {
            'id': 'v-cfa6lye0dkdd-sf',
            'ext': 'mp4',
            'timestamp': 1635337199,
            'duration': 133080,
            'upload_date': '20211027',
            'description': 'md5:e741185fc309310ff5d0c789b437be66',
            'title': 'md5:35647293513a6c92363817a0fb0a7961',
        },
    }, {
        'url': 'https://player.glomex.com/integration/1/iframe-player.html?origin=fullpage&integrationId=19syy24xjn1oqlpc&playlistId=rl-vcb49w1fb592p&playlistIndex=0',
        'info_dict': {
            'id': 'rl-vcb49w1fb592p',
        },
        'playlist_count': 100,
    }, {
        'url': 'https://player.glomex.com/integration/1/iframe-player.html?playlistId=cl-bgqaata6aw8x&integrationId=19syy24xjn1oqlpc',
        'info_dict': {
            'id': 'cl-bgqaata6aw8x',
        },
        'playlist_mincount': 2,
    }]

    @classmethod
    def build_player_url(cls, video_id, integration, origin_url=None):
        """Build an https player URL for video_id and integration.

        origin_url, when not None, is smuggled into the returned URL.
        """
        query_string = urllib.parse.urlencode({
            'playlistId': video_id,
            'integrationId': integration,
        })
        return cls._smuggle_origin_url(f'https:{cls._BASE_PLAYER_URL}?{query_string}', origin_url)

    @classmethod
    def _extract_urls(cls, webpage, origin_url):
        """Yield player URLs for the Glomex embeds found in webpage.

        Three embed forms are recognised by EMBED_RE:
        * an <iframe> whose src is a player URL,
        * a <glomex-player> element, or a <div> carrying
          data-glomex-player="true", with data-integration-id and
          data-playlist-id attributes,
        * an inline <script> with hard-coded integrationId / playlistId values.

        origin_url is smuggled into every yielded URL (unless it is None).
        """
        # Looser than _VALID_URL: the scheme is optional and the query string
        # is "anything up to the closing quote" (_q1 is defined in EMBED_RE).
        # The extracted URL is checked again with cls.suitable() below.
        VALID_SRC = rf'(?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'

        # https://docs.glomex.com/publisher/video-player-integration/javascript-api/
        EMBED_RE = r'''(?x)(?:
            <iframe[^>]+?src=(?P<_q1>%(quot_re)s)(?P<url>%(url_re)s)(?P=_q1)|
            <(?P<html_tag>glomex-player|div)(?:
                data-integration-id=(?P<_q2>%(quot_re)s)(?P<integration_html>(?:(?!(?P=_q2)).)+)(?P=_q2)|
                data-playlist-id=(?P<_q3>%(quot_re)s)(?P<id_html>(?:(?!(?P=_q3)).)+)(?P=_q3)|
                data-glomex-player=(?P<_q4>%(quot_re)s)(?P<glomex_player>true)(?P=_q4)|
                [^>]*?
            )+>|
            # naive parsing of inline scripts for hard-coded integration parameters
            <(?P<script_tag>script)[^<]*?>(?:
                (?P<_stjs1>dataset\.)?integrationId\s*(?(_stjs1)=|:)\s*
                (?P<_q5>%(quot_re)s)(?P<integration_js>(?:(?!(?P=_q5)).)+)(?P=_q5)\s*(?(_stjs1);|,)?|
                (?P<_stjs2>dataset\.)?playlistId\s*(?(_stjs2)=|:)\s*
                (?P<_q6>%(quot_re)s)(?P<id_js>(?:(?!(?P=_q6)).)+)(?P=_q6)\s*(?(_stjs2);|,)?|
                (?:\s|.)*?
            )+</script>
        )''' % {'quot_re': r'["\']', 'url_re': VALID_SRC}

        for mobj in re.finditer(EMBED_RE, webpage):
            mdict = mobj.groupdict()
            if mdict.get('url'):
                url = unescapeHTML(mdict['url'])
                if not cls.suitable(url):
                    continue
                yield cls._smuggle_origin_url(url, origin_url)
            elif mdict.get('html_tag'):
                # A plain <div> only counts when flagged as a Glomex player.
                if mdict['html_tag'] == 'div' and not mdict.get('glomex_player'):
                    continue
                # The playlist ID group is named 'id_html' in EMBED_RE; looking
                # it up under any other name always gives None and silently
                # drops every element embed.
                if not mdict.get('id_html') or not mdict.get('integration_html'):
                    continue
                yield cls.build_player_url(mdict['id_html'], mdict['integration_html'], origin_url)
            elif mdict.get('script_tag'):
                # Likewise the script variant's playlist ID group is 'id_js'.
                if not mdict.get('id_js') or not mdict.get('integration_js'):
                    continue
                yield cls.build_player_url(mdict['id_js'], mdict['integration_js'], origin_url)

    def _real_extract(self, url):
        """Extract the video or playlist referenced by a player URL.

        Raises ExtractorError (expected) when the URL carries no integrationId
        query parameter.
        """
        url, origin_url = self._unsmuggle_origin_url(url)
        playlist_id = self._match_id(url)
        integration = parse_qs(url).get('integrationId', [None])[0]
        if not integration:
            raise ExtractorError('No integrationId in URL', expected=True)
        return self._download_and_extract_api_data(playlist_id, integration, origin_url)