]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/glomex.py
[EmbedThumbnail] Do not remove id3v1 tags
[yt-dlp.git] / yt_dlp / extractor / glomex.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5 import urllib.parse
6
7 from .common import InfoExtractor
8 from ..utils import (
9 determine_ext,
10 extract_attributes,
11 ExtractorError,
12 int_or_none,
13 parse_qs,
14 smuggle_url,
15 unescapeHTML,
16 unsmuggle_url,
17 )
18
19
class GlomexBaseIE(InfoExtractor):
    """Common helpers shared by the Glomex extractors.

    Bundles the origin-URL smuggling helpers, the integration API request and
    the conversion of API video entries into info dicts.
    """

    _DEFAULT_ORIGIN_URL = 'https://player.glomex.com/'
    _API_URL = 'https://integration-cloudfront-eu-west-1.mes.glomex.cloud/'

    # Maps the ID prefix (the text before the first '-') to a label that is
    # only used in the download progress/error notes.
    _VIDEOID_TYPES = {
        'v': 'video',
        'pl': 'playlist',
        'rl': 'related videos playlist',
        'cl': 'curated playlist',
    }

    @staticmethod
    def _smuggle_origin_url(url, origin_url):
        """Attach origin_url to url as smuggled data.

        The url is returned untouched when origin_url is None.
        """
        if origin_url is None:
            return url
        return smuggle_url(url, {'origin': origin_url})

    @classmethod
    def _unsmuggle_origin_url(cls, url, fallback_origin_url=None):
        """Split url into the plain URL and the origin URL carried with it.

        Returns a (url, origin) tuple. The origin falls back to
        fallback_origin_url, then to _DEFAULT_ORIGIN_URL.
        """
        default_origin = fallback_origin_url or cls._DEFAULT_ORIGIN_URL
        unsmuggled_url, data = unsmuggle_url(url, default={'origin': default_origin})
        # Data smuggled by something other than _smuggle_origin_url may lack
        # an 'origin' entry; use the default instead of raising KeyError.
        return unsmuggled_url, data.get('origin') or default_origin

    def _get_videoid_type(self, video_id):
        """Return a human-readable label for the prefix of video_id."""
        prefix = video_id.split('-')[0]
        return self._VIDEOID_TYPES.get(prefix, 'unknown type')

    def _download_api_data(self, video_id, integration, current_url=None):
        """Request the API JSON for video_id under the given integration ID.

        current_url is sent as the 'current_url' query parameter and defaults
        to _DEFAULT_ORIGIN_URL.
        """
        query = {
            'integration_id': integration,
            'playlist_id': video_id,
            'current_url': current_url or self._DEFAULT_ORIGIN_URL,
        }
        video_id_type = self._get_videoid_type(video_id)
        return self._download_json(
            self._API_URL,
            video_id, f'Downloading {video_id_type} JSON',
            f'Unable to download {video_id_type} JSON',
            query=query)

    def _download_and_extract_api_data(self, video_id, integration, current_url):
        """Download the API data and turn it into an extraction result.

        Returns the single info dict when the API lists exactly one video,
        otherwise a playlist result of all listed videos.

        Raises ExtractorError when the response lists no videos.
        """
        api_data = self._download_api_data(video_id, integration, current_url)
        # .get() so that a response without 'videos' produces the intended
        # ExtractorError instead of a bare KeyError
        videos = api_data.get('videos')
        if not videos:
            raise ExtractorError(f'no videos found for {video_id}')
        entries = [self._extract_api_data(video, video_id) for video in videos]
        if len(entries) == 1:
            return entries[0]
        return self.playlist_result(entries, video_id)

    def _extract_api_data(self, video, video_id):
        """Build an info dict from one entry of the API's 'videos' list."""
        if video.get('error_code') == 'contentGeoblocked':
            # A missing country list must not mask the geo-restriction error
            self.raise_geo_restricted(countries=video.get('geo_locations'))

        formats, subs = [], {}
        # Tolerate entries without sources rather than failing with KeyError
        for format_id, format_url in (video.get('source') or {}).items():
            if determine_ext(format_url) == 'm3u8':
                formats_, subs_ = self._extract_m3u8_formats_and_subtitles(
                    format_url, video_id, 'mp4', m3u8_id=format_id,
                    fatal=False)
                formats.extend(formats_)
                self._merge_subtitles(subs_, target=subs)
            else:
                formats.append({
                    'url': format_url,
                    'format_id': format_id,
                })

        language = video.get('language')
        if language:
            # The language is given per video, so it applies to every format
            for fmt in formats:
                fmt['language'] = language
        self._sort_formats(formats)

        # 'images' (a list) and 'image' (a single entry) are merged into one list
        images = (video.get('images') or []) + [video.get('image') or {}]
        thumbnails = [{
            'id': image.get('id'),
            'url': f'{image["url"]}/profile:player-960x540',
            'width': 960,
            'height': 540,
        } for image in images if image.get('url')]
        self._remove_duplicate_formats(thumbnails)

        return {
            'id': video.get('clip_id') or video_id,
            'title': video.get('title'),
            'description': video.get('description'),
            'thumbnails': thumbnails,
            'duration': int_or_none(video.get('clip_duration')),
            'timestamp': video.get('created_at'),
            'formats': formats,
            'subtitles': subs,
        }
109
110
class GlomexIE(GlomexBaseIE):
    """Extractor for video pages on video.glomex.com.

    Extraction is delegated to GlomexEmbedIE through a player URL built from
    the video ID and this site's integration ID.
    """

    IE_NAME = 'glomex'
    IE_DESC = 'Glomex videos'
    _VALID_URL = r'https?://video\.glomex\.com/[^/]+/(?P<id>v-[^-]+)'
    _INTEGRATION_ID = '19syy24xjn1oqlpc'

    _TESTS = [{
        'url': 'https://video.glomex.com/sport/v-cb24uwg77hgh-nach-2-0-sieg-guardiola-mit-mancity-vor-naechstem-titel',
        'md5': 'cec33a943c4240c9cb33abea8c26242e',
        'info_dict': {
            'id': 'v-cb24uwg77hgh',
            'ext': 'mp4',
            'title': 'md5:38a90cedcfadd72982c81acf13556e0c',
            'description': 'md5:1ea6b6caff1443fcbbba159e432eedb8',
            'duration': 29600,
            'timestamp': 1619895017,
            'upload_date': '20210501',
        },
    }]

    def _real_extract(self, url):
        """Hand the video over to GlomexEmbedIE via a generated player URL."""
        clip_id = self._match_id(url)
        # The page URL is passed along as the origin of the player URL
        player_url = GlomexEmbedIE.build_player_url(clip_id, self._INTEGRATION_ID, url)
        embed_ie_key = GlomexEmbedIE.ie_key()
        return self.url_result(player_url, embed_ie_key, clip_id)
136
137
class GlomexEmbedIE(GlomexBaseIE):
    """Extractor for Glomex iframe-player URLs, plus embed discovery in webpages."""

    IE_NAME = 'glomex:embed'
    IE_DESC = 'Glomex embedded videos'
    _BASE_PLAYER_URL = '//player.glomex.com/integration/1/iframe-player.html'
    # Same URL as a regex, with the '1' path segment replaced by a wildcard
    _BASE_PLAYER_URL_RE = re.escape(_BASE_PLAYER_URL).replace('/1/', r'/[^/]/')
    _VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?playlistId=(?P<id>[^#&]+)'

    _TESTS = [{
        'url': 'https://player.glomex.com/integration/1/iframe-player.html?integrationId=4059a013k56vb2yd&playlistId=v-cfa6lye0dkdd-sf',
        'md5': '68f259b98cc01918ac34180142fce287',
        'info_dict': {
            'id': 'v-cfa6lye0dkdd-sf',
            'ext': 'mp4',
            'timestamp': 1635337199,
            'duration': 133080,
            'upload_date': '20211027',
            'description': 'md5:e741185fc309310ff5d0c789b437be66',
            'title': 'md5:35647293513a6c92363817a0fb0a7961',
        },
    }, {
        'url': 'https://player.glomex.com/integration/1/iframe-player.html?origin=fullpage&integrationId=19syy24xjn1oqlpc&playlistId=rl-vcb49w1fb592p&playlistIndex=0',
        'info_dict': {
            'id': 'rl-vcb49w1fb592p',
        },
        'playlist_count': 100,
    }, {
        'url': 'https://player.glomex.com/integration/1/iframe-player.html?playlistId=cl-bgqaata6aw8x&integrationId=19syy24xjn1oqlpc',
        'info_dict': {
            'id': 'cl-bgqaata6aw8x',
        },
        'playlist_mincount': 2,
    }]

    @classmethod
    def build_player_url(cls, video_id, integration, origin_url=None):
        """Build an iframe-player URL for video_id under the given integration ID.

        When origin_url is not None it is smuggled into the returned URL.
        """
        query_string = urllib.parse.urlencode({
            'playlistId': video_id,
            'integrationId': integration,
        })
        return cls._smuggle_origin_url(f'https:{cls._BASE_PLAYER_URL}?{query_string}', origin_url)

    @classmethod
    def _extract_urls(cls, webpage, origin_url):
        """Yield player URLs for the Glomex embeds found in webpage.

        Three kinds of embeds are looked for, in this order: player iframes,
        <glomex-player>/<div data-glomex-player> elements, and inline scripts
        with hard-coded integration parameters. origin_url is smuggled into
        every yielded URL.
        """
        # https://docs.glomex.com/publisher/video-player-integration/javascript-api/
        quot_re = r'["\']'

        regex = fr'''(?x)
            <iframe[^>]+?src=(?P<q>{quot_re})(?P<url>
                (?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=q)).)+
            )(?P=q)'''
        for mobj in re.finditer(regex, webpage):
            url = unescapeHTML(mobj.group('url'))
            if cls.suitable(url):
                yield cls._smuggle_origin_url(url, origin_url)

        regex = fr'''(?x)
            <glomex-player [^>]+?>|
            <div[^>]* data-glomex-player=(?P<q>{quot_re})true(?P=q)[^>]*>'''
        for mobj in re.finditer(regex, webpage):
            attrs = extract_attributes(mobj.group(0))
            if attrs.get('data-integration-id') and attrs.get('data-playlist-id'):
                yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], origin_url)

        # naive parsing of inline scripts for hard-coded integration parameters
        regex = fr'''(?x)
            (?P<is_js>dataset\.)?%s\s*(?(is_js)=|:)\s*
            (?P<q>{quot_re})(?P<id>(?:(?!(?P=q)).)+)(?P=q)\s'''
        for mobj in re.finditer(r'(?x)<script[^<]*>.+?</script>', webpage):
            script = mobj.group(0)
            integration_id = re.search(regex % 'integrationId', script)
            if not integration_id:
                continue
            playlist_id = re.search(regex % 'playlistId', script)
            if playlist_id:
                # Pass the captured IDs, not the match objects themselves:
                # urlencode() would otherwise embed their repr() in the URL
                yield cls.build_player_url(
                    playlist_id.group('id'), integration_id.group('id'), origin_url)

    def _real_extract(self, url):
        """Extract the video or playlist referenced by a player URL.

        Raises ExtractorError when the URL carries no integrationId parameter.
        """
        url, origin_url = self._unsmuggle_origin_url(url)
        playlist_id = self._match_id(url)
        integration = parse_qs(url).get('integrationId', [None])[0]
        if not integration:
            raise ExtractorError('No integrationId in URL', expected=True)
        return self._download_and_extract_api_data(playlist_id, integration, origin_url)