]>
Commit | Line | Data |
---|---|---|
1 | # coding: utf-8 | |
2 | from __future__ import unicode_literals | |
3 | ||
4 | import re | |
5 | import urllib.parse | |
6 | ||
7 | from .common import InfoExtractor | |
8 | from ..utils import ( | |
9 | determine_ext, | |
10 | extract_attributes, | |
11 | ExtractorError, | |
12 | int_or_none, | |
13 | parse_qs, | |
14 | smuggle_url, | |
15 | unescapeHTML, | |
16 | unsmuggle_url, | |
17 | ) | |
18 | ||
19 | ||
class GlomexBaseIE(InfoExtractor):
    """Shared helpers for the Glomex extractors.

    Provides smuggling of the embedding page's URL ("origin") into player
    URLs, querying of the Glomex integration API and conversion of the API's
    video objects into info dicts.
    """

    _DEFAULT_ORIGIN_URL = 'https://player.glomex.com/'
    _API_URL = 'https://integration-cloudfront-eu-west-1.mes.glomex.cloud/'

    @staticmethod
    def _smuggle_origin_url(url, origin_url):
        """Return url with origin_url smuggled into it under the 'origin' key.

        When origin_url is None the url is returned unchanged.
        """
        if origin_url is not None:
            return smuggle_url(url, {'origin': origin_url})
        return url

    @classmethod
    def _unsmuggle_origin_url(cls, url, fallback_origin_url=None):
        """Split a possibly smuggled url into (plain url, origin url).

        The default payload handed to unsmuggle_url carries
        fallback_origin_url, or the class-wide default origin when that
        is falsy.
        """
        fallback = fallback_origin_url or cls._DEFAULT_ORIGIN_URL
        plain_url, smuggled = unsmuggle_url(url, default={'origin': fallback})
        return plain_url, smuggled['origin']

    def _get_videoid_type(self, video_id):
        """Map the prefix of video_id (text before the first '-') to a label."""
        type_names = {
            'v': 'video',
            'pl': 'playlist',
            'rl': 'related videos playlist',
            'cl': 'curated playlist',
        }
        id_prefix = video_id.partition('-')[0]
        return type_names.get(id_prefix, 'unknown type')

    def _download_api_data(self, video_id, integration, current_url=None):
        """Fetch the API JSON for video_id using the given integration id.

        current_url is sent as the 'current_url' query parameter; the default
        origin URL is used when it is falsy.
        """
        id_type = self._get_videoid_type(video_id)
        params = {
            'integration_id': integration,
            'playlist_id': video_id,
            'current_url': current_url or self._DEFAULT_ORIGIN_URL,
        }
        return self._download_json(
            self._API_URL, video_id,
            'Downloading %s JSON' % id_type,
            'Unable to download %s JSON' % id_type,
            query=params)

    def _download_and_extract_api_data(self, video_id, integration, current_url):
        """Download the API data and turn it into a video or playlist result.

        Returns the single info dict when the API lists exactly one video,
        otherwise a playlist result of all entries. Raises ExtractorError
        when the 'videos' list is empty.
        """
        raw_videos = self._download_api_data(video_id, integration, current_url)['videos']
        if not raw_videos:
            raise ExtractorError('no videos found for %s' % video_id)

        entries = [self._extract_api_data(item, video_id) for item in raw_videos]
        if len(entries) == 1:
            return entries[0]
        return self.playlist_result(entries, video_id)

    def _extract_api_data(self, video, video_id):
        """Build an info dict from one video object of the API response."""
        if video.get('error_code') == 'contentGeoblocked':
            self.raise_geo_restricted(countries=video['geo_locations'])

        formats, subs = [], {}
        for format_id, format_url in video['source'].items():
            if determine_ext(format_url) != 'm3u8':
                # Anything that is not an HLS manifest is used as a direct URL
                formats.append({
                    'url': format_url,
                    'format_id': format_id,
                })
                continue
            hls_formats, hls_subs = self._extract_m3u8_formats_and_subtitles(
                format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)
            formats.extend(hls_formats)
            self._merge_subtitles(hls_subs, target=subs)

        # The language is reported once per video; copy it onto every format
        language = video.get('language')
        if language:
            for fmt in formats:
                fmt['language'] = language
        self._sort_formats(formats)

        # 'images' (a list) and 'image' (a single object) are both considered;
        # entries without a URL are skipped and duplicates dropped afterwards
        candidates = (video.get('images') or []) + [video.get('image') or {}]
        thumbnails = []
        for image in candidates:
            if not image.get('url'):
                continue
            thumbnails.append({
                'id': image.get('id'),
                'url': f'{image["url"]}/profile:player-960x540',
                'width': 960,
                'height': 540,
            })
        self._remove_duplicate_formats(thumbnails)

        return {
            'id': video.get('clip_id') or video_id,
            'title': video.get('title'),
            'description': video.get('description'),
            'thumbnails': thumbnails,
            'duration': int_or_none(video.get('clip_duration')),
            'timestamp': video.get('created_at'),
            'formats': formats,
            'subtitles': subs,
        }
109 | ||
110 | ||
class GlomexIE(GlomexBaseIE):
    """Extractor for video pages on video.glomex.com."""

    IE_NAME = 'glomex'
    IE_DESC = 'Glomex videos'
    _VALID_URL = r'https?://video\.glomex\.com/[^/]+/(?P<id>v-[^-]+)'
    # Integration id used for every player URL built by this extractor
    _INTEGRATION_ID = '19syy24xjn1oqlpc'

    _TESTS = [{
        'url': 'https://video.glomex.com/sport/v-cb24uwg77hgh-nach-2-0-sieg-guardiola-mit-mancity-vor-naechstem-titel',
        'md5': 'cec33a943c4240c9cb33abea8c26242e',
        'info_dict': {
            'id': 'v-cb24uwg77hgh',
            'ext': 'mp4',
            'title': 'md5:38a90cedcfadd72982c81acf13556e0c',
            'description': 'md5:1ea6b6caff1443fcbbba159e432eedb8',
            'duration': 29600,
            'timestamp': 1619895017,
            'upload_date': '20210501',
        },
    }]

    def _real_extract(self, url):
        """Delegate to GlomexEmbedIE via a player URL built for the video id.

        The page URL itself is passed along as the origin URL.
        """
        video_id = self._match_id(url)
        player_url = GlomexEmbedIE.build_player_url(
            video_id, self._INTEGRATION_ID, url)
        return self.url_result(player_url, GlomexEmbedIE.ie_key(), video_id)
136 | ||
137 | ||
class GlomexEmbedIE(GlomexBaseIE):
    """Extractor for Glomex iframe player URLs and for embeds found in pages."""

    IE_NAME = 'glomex:embed'
    IE_DESC = 'Glomex embedded videos'
    _BASE_PLAYER_URL = '//player.glomex.com/integration/1/iframe-player.html'
    # Escaped player URL whose '/1/' path segment is swapped for a pattern
    # accepting any single non-slash character
    _BASE_PLAYER_URL_RE = re.escape(_BASE_PLAYER_URL).replace('/1/', r'/[^/]/')
    _VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?playlistId=(?P<id>[^#&]+)'

    _TESTS = [{
        'url': 'https://player.glomex.com/integration/1/iframe-player.html?integrationId=4059a013k56vb2yd&playlistId=v-cfa6lye0dkdd-sf',
        'md5': '68f259b98cc01918ac34180142fce287',
        'info_dict': {
            'id': 'v-cfa6lye0dkdd-sf',
            'ext': 'mp4',
            'timestamp': 1635337199,
            'duration': 133080,
            'upload_date': '20211027',
            'description': 'md5:e741185fc309310ff5d0c789b437be66',
            'title': 'md5:35647293513a6c92363817a0fb0a7961',
        },
    }, {
        'url': 'https://player.glomex.com/integration/1/iframe-player.html?origin=fullpage&integrationId=19syy24xjn1oqlpc&playlistId=rl-vcb49w1fb592p&playlistIndex=0',
        'info_dict': {
            'id': 'rl-vcb49w1fb592p',
        },
        'playlist_count': 100,
    }, {
        'url': 'https://player.glomex.com/integration/1/iframe-player.html?playlistId=cl-bgqaata6aw8x&integrationId=19syy24xjn1oqlpc',
        'info_dict': {
            'id': 'cl-bgqaata6aw8x',
        },
        'playlist_mincount': 2,
    }]

    @classmethod
    def build_player_url(cls, video_id, integration, origin_url=None):
        """Build an iframe player URL for video_id and the integration id.

        When origin_url is not None it is smuggled into the returned URL.
        """
        query_string = urllib.parse.urlencode({
            'playlistId': video_id,
            'integrationId': integration,
        })
        return cls._smuggle_origin_url(f'https:{cls._BASE_PLAYER_URL}?{query_string}', origin_url)

    @classmethod
    def _extract_urls(cls, webpage, origin_url):
        """Yield player URLs for all Glomex embeds detected in webpage.

        Three kinds of embeds are searched for: player iframes,
        <glomex-player> / <div data-glomex-player="true"> elements, and
        inline scripts with hard-coded integrationId / playlistId values.
        Every yielded URL has origin_url smuggled into it (unless it is None).
        """
        # https://docs.glomex.com/publisher/video-player-integration/javascript-api/
        quot_re = r'["\']'

        regex = fr'''(?x)
            <iframe[^>]+?src=(?P<q>{quot_re})(?P<url>
                (?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=q)).)+
            )(?P=q)'''
        for mobj in re.finditer(regex, webpage):
            url = unescapeHTML(mobj.group('url'))
            if cls.suitable(url):
                yield cls._smuggle_origin_url(url, origin_url)

        regex = fr'''(?x)
            <glomex-player [^>]+?>|
            <div[^>]* data-glomex-player=(?P<q>{quot_re})true(?P=q)[^>]*>'''
        for mobj in re.finditer(regex, webpage):
            attrs = extract_attributes(mobj.group(0))
            if attrs.get('data-integration-id') and attrs.get('data-playlist-id'):
                yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], origin_url)

        # naive parsing of inline scripts for hard-coded integration parameters
        regex = fr'''(?x)
            (?P<is_js>dataset\.)?%s\s*(?(is_js)=|:)\s*
            (?P<q>{quot_re})(?P<id>(?:(?!(?P=q)).)+)(?P=q)\s'''
        # Both patterns are loop-invariant, so build them once up front
        integration_id_re = regex % 'integrationId'
        playlist_id_re = regex % 'playlistId'
        # NOTE(review): without the DOTALL flag '.' does not match newlines,
        # so only <script> elements written on a single line are scanned here;
        # confirm whether that is intended.
        for mobj in re.finditer(r'(?x)<script[^<]*>.+?</script>', webpage):
            script = mobj.group(0)
            integration_id = re.search(integration_id_re, script)
            if not integration_id:
                continue
            playlist_id = re.search(playlist_id_re, script)
            if playlist_id:
                # Pass the captured ids, not the match objects themselves;
                # urlencode() would otherwise embed their repr() in the URL
                yield cls.build_player_url(
                    playlist_id.group('id'), integration_id.group('id'), origin_url)

    def _real_extract(self, url):
        """Extract the video or playlist referenced by a player URL.

        Raises ExtractorError (expected) when the URL carries no
        integrationId query parameter.
        """
        url, origin_url = self._unsmuggle_origin_url(url)
        playlist_id = self._match_id(url)
        integration = parse_qs(url).get('integrationId', [None])[0]
        if not integration:
            raise ExtractorError('No integrationId in URL', expected=True)
        return self._download_and_extract_api_data(playlist_id, integration, origin_url)