]>
Commit | Line | Data |
---|---|---|
71738b14 ZM |
1 | import re |
2 | import urllib.parse | |
3 | ||
4 | from .common import InfoExtractor | |
5 | from ..utils import ( | |
6 | determine_ext, | |
5e51f4a8 | 7 | extract_attributes, |
71738b14 ZM |
8 | ExtractorError, |
9 | int_or_none, | |
10 | parse_qs, | |
11 | smuggle_url, | |
12 | unescapeHTML, | |
13 | unsmuggle_url, | |
14 | ) | |
15 | ||
16 | ||
class GlomexBaseIE(InfoExtractor):
    """Shared helpers for the Glomex extractors.

    Provides smuggling/unsmuggling of the embedding page ("origin") URL and
    the download + conversion of Glomex integration API responses.
    """

    _DEFAULT_ORIGIN_URL = 'https://player.glomex.com/'
    _API_URL = 'https://integration-cloudfront-eu-west-1.mes.glomex.cloud/'

    # Maps the prefix of a Glomex ID (the text before the first '-') to the
    # label used in the download progress/error messages.
    _VIDEOID_TYPES = {
        'v': 'video',
        'pl': 'playlist',
        'rl': 'related videos playlist',
        'cl': 'curated playlist',
    }

    @staticmethod
    def _smuggle_origin_url(url, origin_url):
        """Return *url* with *origin_url* smuggled in under the 'origin' key.

        When *origin_url* is None the URL is returned unchanged.
        """
        if origin_url is None:
            return url
        return smuggle_url(url, {'origin': origin_url})

    @classmethod
    def _unsmuggle_origin_url(cls, url, fallback_origin_url=None):
        """Return a ``(url, origin_url)`` tuple for a possibly smuggled *url*.

        If nothing was smuggled, the origin is *fallback_origin_url* or, when
        that is falsy, ``_DEFAULT_ORIGIN_URL``.
        """
        defaults = {'origin': fallback_origin_url or cls._DEFAULT_ORIGIN_URL}
        unsmuggled_url, data = unsmuggle_url(url, default=defaults)
        return unsmuggled_url, data['origin']

    def _get_videoid_type(self, video_id):
        """Return a human-readable label for the kind of ID *video_id* is."""
        prefix = video_id.split('-')[0]
        return self._VIDEOID_TYPES.get(prefix, 'unknown type')

    def _download_api_data(self, video_id, integration, current_url=None):
        """Download the API JSON for *video_id* under the given *integration* ID.

        *current_url* is sent as the ``current_url`` query parameter and
        defaults to ``_DEFAULT_ORIGIN_URL`` when falsy.
        """
        query = {
            'integration_id': integration,
            'playlist_id': video_id,
            'current_url': current_url or self._DEFAULT_ORIGIN_URL,
        }
        video_id_type = self._get_videoid_type(video_id)
        return self._download_json(
            self._API_URL,
            video_id, f'Downloading {video_id_type} JSON',
            f'Unable to download {video_id_type} JSON',
            query=query)

    def _download_and_extract_api_data(self, video_id, integration, current_url):
        """Fetch API data and return a single info dict or a playlist result.

        Raises ExtractorError when the response contains no videos.
        """
        api_data = self._download_api_data(video_id, integration, current_url)
        # Use .get() so that a response lacking the 'videos' key produces the
        # ExtractorError below instead of a bare KeyError.
        videos = api_data.get('videos')
        if not videos:
            raise ExtractorError(f'no videos found for {video_id}')
        videos = [self._extract_api_data(video, video_id) for video in videos]
        return videos[0] if len(videos) == 1 else self.playlist_result(videos, video_id)

    def _extract_api_data(self, video, video_id):
        """Convert one entry of the API's 'videos' list into an info dict."""
        if video.get('error_code') == 'contentGeoblocked':
            # A missing 'geo_locations' must not mask the geo-restriction
            # error with a KeyError.
            self.raise_geo_restricted(countries=video.get('geo_locations'))

        formats, subs = [], {}
        # 'source' maps a format ID to its URL; tolerate it being absent/null.
        for format_id, format_url in (video.get('source') or {}).items():
            ext = determine_ext(format_url)
            if ext == 'm3u8':
                formats_, subs_ = self._extract_m3u8_formats_and_subtitles(
                    format_url, video_id, 'mp4', m3u8_id=format_id,
                    fatal=False)
                formats.extend(formats_)
                self._merge_subtitles(subs_, target=subs)
            else:
                formats.append({
                    'url': format_url,
                    'format_id': format_id,
                })
        language = video.get('language')
        if language:
            for fmt in formats:
                fmt['language'] = language

        # Combine the 'images' list with the single 'image' entry; entries
        # without a URL are filtered out below.
        images = (video.get('images') or []) + [video.get('image') or {}]
        thumbnails = [{
            'id': image.get('id'),
            'url': f'{image["url"]}/profile:player-960x540',
            'width': 960,
            'height': 540,
        } for image in images if image.get('url')]
        # NOTE(review): presumably drops entries sharing the same URL - confirm
        # against InfoExtractor._remove_duplicate_formats.
        self._remove_duplicate_formats(thumbnails)

        return {
            'id': video.get('clip_id') or video_id,
            'title': video.get('title'),
            'description': video.get('description'),
            'thumbnails': thumbnails,
            'duration': int_or_none(video.get('clip_duration')),
            'timestamp': video.get('created_at'),
            'formats': formats,
            'subtitles': subs,
        }
105 | ||
106 | ||
class GlomexIE(GlomexBaseIE):
    """Extractor for video pages on video.glomex.com.

    Does no extraction itself: the page URL is rewritten into an iframe-player
    URL and handed over to GlomexEmbedIE.
    """

    IE_NAME = 'glomex'
    IE_DESC = 'Glomex videos'
    _VALID_URL = r'https?://video\.glomex\.com/[^/]+/(?P<id>v-[^-]+)'
    # Integration ID used when building the player URL for these pages.
    _INTEGRATION_ID = '19syy24xjn1oqlpc'

    _TESTS = [{
        'url': 'https://video.glomex.com/sport/v-cb24uwg77hgh-nach-2-0-sieg-guardiola-mit-mancity-vor-naechstem-titel',
        'md5': 'cec33a943c4240c9cb33abea8c26242e',
        'info_dict': {
            'id': 'v-cb24uwg77hgh',
            'ext': 'mp4',
            'title': 'md5:38a90cedcfadd72982c81acf13556e0c',
            'description': 'md5:1ea6b6caff1443fcbbba159e432eedb8',
            'duration': 29600,
            'timestamp': 1619895017,
            'upload_date': '20210501',
        },
    }]

    def _real_extract(self, url):
        """Delegate to GlomexEmbedIE, passing the page URL as the origin."""
        clip_id = self._match_id(url)
        player_url = GlomexEmbedIE.build_player_url(clip_id, self._INTEGRATION_ID, url)
        return self.url_result(player_url, GlomexEmbedIE.ie_key(), clip_id)
132 | ||
133 | ||
class GlomexEmbedIE(GlomexBaseIE):
    """Extractor for Glomex iframe-player URLs and players embedded in pages."""

    IE_NAME = 'glomex:embed'
    IE_DESC = 'Glomex embedded videos'
    _BASE_PLAYER_URL = '//player.glomex.com/integration/1/iframe-player.html'
    # Same URL as a regex, with the '1' path segment relaxed to any single
    # non-slash character.
    _BASE_PLAYER_URL_RE = re.escape(_BASE_PLAYER_URL).replace('/1/', r'/[^/]/')
    _VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?playlistId=(?P<id>[^#&]+)'

    _TESTS = [{
        'url': 'https://player.glomex.com/integration/1/iframe-player.html?integrationId=4059a013k56vb2yd&playlistId=v-cfa6lye0dkdd-sf',
        'md5': '68f259b98cc01918ac34180142fce287',
        'info_dict': {
            'id': 'v-cfa6lye0dkdd-sf',
            'ext': 'mp4',
            'timestamp': 1635337199,
            'duration': 133080,
            'upload_date': '20211027',
            'description': 'md5:e741185fc309310ff5d0c789b437be66',
            'title': 'md5:35647293513a6c92363817a0fb0a7961',
        },
    }, {
        'url': 'https://player.glomex.com/integration/1/iframe-player.html?origin=fullpage&integrationId=19syy24xjn1oqlpc&playlistId=rl-vcb49w1fb592p&playlistIndex=0',
        'info_dict': {
            'id': 'rl-vcb49w1fb592p',
        },
        'playlist_count': 100,
    }, {
        'url': 'https://player.glomex.com/integration/1/iframe-player.html?playlistId=cl-bgqaata6aw8x&integrationId=19syy24xjn1oqlpc',
        'info_dict': {
            'id': 'cl-bgqaata6aw8x',
        },
        'playlist_mincount': 2,
    }]

    @classmethod
    def build_player_url(cls, video_id, integration, origin_url=None):
        """Build an iframe-player URL for *video_id* and *integration*.

        When *origin_url* is given it is smuggled into the returned URL.
        """
        query_string = urllib.parse.urlencode({
            'playlistId': video_id,
            'integrationId': integration,
        })
        return cls._smuggle_origin_url(f'https:{cls._BASE_PLAYER_URL}?{query_string}', origin_url)

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """Yield player URLs for every Glomex embed found in *webpage*.

        Three embed styles are recognised: ``<iframe>`` players,
        ``<glomex-player>`` / ``<div data-glomex-player="true">`` elements, and
        integration parameters hard-coded in inline scripts. *url* (the page
        being scanned) is smuggled into each result as the origin.
        """
        # https://docs.glomex.com/publisher/video-player-integration/javascript-api/
        quot_re = r'["\']'

        # 1) iframe embeds pointing directly at the player
        regex = fr'''(?x)
            <iframe[^>]+?src=(?P<q>{quot_re})(?P<url>
                (?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=q)).)+
            )(?P=q)'''
        for mobj in re.finditer(regex, webpage):
            embed_url = unescapeHTML(mobj.group('url'))
            if cls.suitable(embed_url):
                yield cls._smuggle_origin_url(embed_url, url)

        # 2) custom element / data-attribute embeds.
        # Whitespace is written as \s because literal spaces are ignored by
        # the regex engine in verbose (?x) mode.
        regex = fr'''(?x)
            <glomex-player\s[^>]+?>|
            <div[^>]*\sdata-glomex-player=(?P<q>{quot_re})true(?P=q)[^>]*>'''
        for mobj in re.finditer(regex, webpage):
            attrs = extract_attributes(mobj.group(0))
            if attrs.get('data-integration-id') and attrs.get('data-playlist-id'):
                yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], url)

        # 3) naive parsing of inline scripts for hard-coded integration parameters
        regex = fr'''(?x)
            (?P<is_js>dataset\.)?%s\s*(?(is_js)=|:)\s*
            (?P<q>{quot_re})(?P<id>(?:(?!(?P=q)).)+)(?P=q)\s'''
        integration_re = regex % 'integrationId'
        playlist_re = regex % 'playlistId'
        # NOTE: no DOTALL flag is used, so only scripts whose body contains no
        # newline are matched here.
        for mobj in re.finditer(r'(?x)<script[^<]*>.+?</script>', webpage):
            script = mobj.group(0)
            integration_id = re.search(integration_re, script)
            if not integration_id:
                continue
            playlist_id = re.search(playlist_re, script)
            if playlist_id:
                # Pass the captured ID strings, not the Match objects;
                # urlencode() would otherwise stringify the Match itself.
                yield cls.build_player_url(
                    playlist_id.group('id'), integration_id.group('id'), url)

    def _real_extract(self, url):
        """Extract the video/playlist referenced by an iframe-player URL.

        Raises ExtractorError when the URL carries no ``integrationId``.
        """
        url, origin_url = self._unsmuggle_origin_url(url)
        playlist_id = self._match_id(url)
        integration = parse_qs(url).get('integrationId', [None])[0]
        if not integration:
            raise ExtractorError('No integrationId in URL', expected=True)
        return self._download_and_extract_api_data(playlist_id, integration, origin_url)