2 from __future__
import unicode_literals
7 from .common
import InfoExtractor
# Base class shared by the Glomex extractors (GlomexIE and GlomexEmbedIE both
# derive from it).
19 class GlomexBaseIE(InfoExtractor
):
# Origin used when the caller supplies none; see the `or` fallbacks in
# _unsmuggle_origin_url and _download_api_data.
20 _DEFAULT_ORIGIN_URL
= 'https://player.glomex.com/'
# NOTE(review): presumably the endpoint that _download_api_data requests --
# confirm against the _download_json call.
21 _API_URL
= 'https://integration-cloudfront-eu-west-1.mes.glomex.cloud/'
# Tag `url` with the embedding page's origin: when an origin is given the
# result is smuggle_url(url, {'origin': origin_url}). A None origin is tested
# for first and handled separately.
# NOTE(review): there is no self/cls parameter, so this is presumably declared
# as a staticmethod -- confirm against the decorator.
24 def _smuggle_origin_url(url
, origin_url
):
25 if origin_url
is None:
27 return smuggle_url(url
, {'origin': origin_url}
)
def _unsmuggle_origin_url(cls, url, fallback_origin_url=None):
    """Split a possibly smuggled URL into the plain URL and its origin.

    The default handed to ``unsmuggle_url`` is a dict whose 'origin' entry is
    ``fallback_origin_url`` or, when that is falsy, ``cls._DEFAULT_ORIGIN_URL``.

    Returns a ``(url, origin)`` tuple, where ``origin`` is the 'origin' entry
    of the data returned by ``unsmuggle_url``.
    """
    origin = fallback_origin_url or cls._DEFAULT_ORIGIN_URL
    plain_url, smuggled = unsmuggle_url(url, default={'origin': origin})
    return plain_url, smuggled['origin']
# Map a Glomex id to a descriptive type name. The text before the first '-' of
# `video_id` is looked up in _VIDEOID_TYPES (e.g. 'rl' -> 'related videos
# playlist', 'cl' -> 'curated playlist'); a prefix that is not in the table
# gives 'unknown type'.
35 def _get_videoid_type(self
, video_id
):
39 'rl': 'related videos playlist',
40 'cl': 'curated playlist',
42 prefix
= video_id
.split('-')[0]
43 return _VIDEOID_TYPES
.get(prefix
, 'unknown type')
# Fetch the API JSON for `video_id` through self._download_json and return it.
# The key/value pairs built here are 'integration_id' (integration),
# 'playlist_id' (video_id) and 'current_url', which falls back to
# self._DEFAULT_ORIGIN_URL when `current_url` is None or empty.
# NOTE(review): these pairs presumably form the request query -- confirm.
# The progress and error messages name the id type reported by
# self._get_videoid_type, e.g. 'Downloading curated playlist JSON'.
45 def _download_api_data(self
, video_id
, integration
, current_url
=None):
47 'integration_id': integration
,
48 'playlist_id': video_id
,
49 'current_url': current_url
or self
._DEFAULT
_ORIGIN
_URL
,
51 video_id_type
= self
._get
_videoid
_type
(video_id
)
52 return self
._download
_json
(
54 video_id
, 'Downloading %s JSON' % video_id_type
,
55 'Unable to download %s JSON' % video_id_type
,
# Download the API data for `video_id` and convert each entry of its 'videos'
# list with self._extract_api_data.
# Returns the single converted entry when there is exactly one; otherwise the
# entries are wrapped with self.playlist_result(videos, video_id).
# Raises ExtractorError('no videos found for <video_id>').
# NOTE(review): presumably raised only when the 'videos' list is empty --
# confirm the guard in front of the raise.
58 def _download_and_extract_api_data(self
, video_id
, integration
, current_url
):
59 api_data
= self
._download
_api
_data
(video_id
, integration
, current_url
)
60 videos
= api_data
['videos']
62 raise ExtractorError('no videos found for %s' % video_id
)
63 videos
= [self
._extract
_api
_data
(video
, video_id
) for video
in videos
]
64 return videos
[0] if len(videos
) == 1 else self
.playlist_result(videos
, video_id
)
# Build the info dict for one `video` entry of the API response.
# - An entry whose 'error_code' is 'contentGeoblocked' is reported through
#   self.raise_geo_restricted with the entry's 'geo_locations'.
# - Every (format_id, format_url) pair of video['source'] contributes formats.
#   _extract_m3u8_formats_and_subtitles is called with 'mp4' and
#   m3u8_id=format_id, and the subtitles it returns are merged into `subs`.
#   NOTE(review): the extension from determine_ext presumably selects between
#   the m3u8 path and a plain format entry -- confirm.
# - When the entry has a 'language', it is copied to fmt['language'].
# - Thumbnails are built from video['images'] plus video['image'], keeping only
#   images that have a 'url'; '/profile:player-960x540' is appended to it.
# - The result uses 'clip_id' (falling back to `video_id`) as id, and takes
#   title, description, duration (int_or_none of 'clip_duration') and timestamp
#   ('created_at') from the entry.
# NOTE(review): the thumbnails list is handed to self._remove_duplicate_formats;
# confirm that helper is meant to de-duplicate thumbnail dicts as well.
66 def _extract_api_data(self
, video
, video_id
):
67 if video
.get('error_code') == 'contentGeoblocked':
68 self
.raise_geo_restricted(countries
=video
['geo_locations'])
70 formats
, subs
= [], {}
71 for format_id
, format_url
in video
['source'].items():
72 ext
= determine_ext(format_url
)
74 formats_
, subs_
= self
._extract
_m
3u8_formats
_and
_subtitles
(
75 format_url
, video_id
, 'mp4', m3u8_id
=format_id
,
77 formats
.extend(formats_
)
78 self
._merge
_subtitles
(subs_
, target
=subs
)
82 'format_id': format_id
,
84 if video
.get('language'):
86 fmt
['language'] = video
['language']
87 self
._sort
_formats
(formats
)
89 images
= (video
.get('images') or []) + [video
.get('image') or {}]
91 'id': image
.get('id'),
92 'url': f
'{image["url"]}/profile:player-960x540',
95 } for image
in images
if image
.get('url')]
96 self
._remove
_duplicate
_formats
(thumbnails
)
99 'id': video
.get('clip_id') or video_id
,
100 'title': video
.get('title'),
101 'description': video
.get('description'),
102 'thumbnails': thumbnails
,
103 'duration': int_or_none(video
.get('clip_duration')),
104 'timestamp': video
.get('created_at'),
# Extractor for video.glomex.com pages.
# _VALID_URL captures as `id` the 'v-' prefix plus the characters up to the
# next '-'. _INTEGRATION_ID is the integration that _real_extract passes to
# GlomexEmbedIE.build_player_url.
# NOTE(review): the 'url'/'md5'/'id'/... pairs below look like the expected
# values of a test case for this extractor -- confirm.
110 class GlomexIE(GlomexBaseIE
):
112 IE_DESC
= 'Glomex videos'
113 _VALID_URL
= r
'https?://video\.glomex\.com/[^/]+/(?P<id>v-[^-]+)'
114 _INTEGRATION_ID
= '19syy24xjn1oqlpc'
117 'url': 'https://video.glomex.com/sport/v-cb24uwg77hgh-nach-2-0-sieg-guardiola-mit-mancity-vor-naechstem-titel',
118 'md5': 'cec33a943c4240c9cb33abea8c26242e',
120 'id': 'v-cb24uwg77hgh',
122 'title': 'md5:38a90cedcfadd72982c81acf13556e0c',
123 'description': 'md5:1ea6b6caff1443fcbbba159e432eedb8',
125 'timestamp': 1619895017,
126 'upload_date': '20210501',
def _real_extract(self, url):
    """Hand a video.glomex.com page URL over to the embed extractor.

    The id matched from ``url`` and the class's ``_INTEGRATION_ID`` are turned
    into a player URL by ``GlomexEmbedIE.build_player_url``, with ``url``
    itself passed as the origin. The return value is the ``url_result`` for
    that player URL, keyed to ``GlomexEmbedIE`` and carrying the matched id.
    """
    clip_id = self._match_id(url)
    embed_ie = GlomexEmbedIE
    player_url = embed_ie.build_player_url(clip_id, self._INTEGRATION_ID, url)
    return self.url_result(player_url, embed_ie.ie_key(), clip_id)
# Extractor for the Glomex iframe player
# (player.glomex.com/integration/<n>/iframe-player.html).
# _BASE_PLAYER_URL_RE is the re.escape()d player URL with its '/1/' segment
# replaced by '/[^/]/'.
# NOTE(review): '[^/]' matches exactly one character, so a multi-character
# segment such as '/10/' would not match -- confirm this is intended.
# _VALID_URL requires a playlistId query parameter and captures its value
# (up to the next '&' or '#') as `id`.
# NOTE(review): the 'url'/'md5'/'id'/playlist_* pairs below look like the
# expected values of test cases for this extractor -- confirm.
137 class GlomexEmbedIE(GlomexBaseIE
):
138 IE_NAME
= 'glomex:embed'
139 IE_DESC
= 'Glomex embedded videos'
140 _BASE_PLAYER_URL
= '//player.glomex.com/integration/1/iframe-player.html'
141 _BASE_PLAYER_URL_RE
= re
.escape(_BASE_PLAYER_URL
).replace('/1/', r
'/[^/]/')
142 _VALID_URL
= rf
'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?playlistId=(?P<id>[^#&]+)'
145 'url': 'https://player.glomex.com/integration/1/iframe-player.html?integrationId=4059a013k56vb2yd&playlistId=v-cfa6lye0dkdd-sf',
146 'md5': '68f259b98cc01918ac34180142fce287',
148 'id': 'v-cfa6lye0dkdd-sf',
150 'timestamp': 1635337199,
152 'upload_date': '20211027',
153 'description': 'md5:e741185fc309310ff5d0c789b437be66',
154 'title': 'md5:35647293513a6c92363817a0fb0a7961',
157 'url': 'https://player.glomex.com/integration/1/iframe-player.html?origin=fullpage&integrationId=19syy24xjn1oqlpc&playlistId=rl-vcb49w1fb592p&playlistIndex=0',
159 'id': 'rl-vcb49w1fb592p',
161 'playlist_count': 100,
163 'url': 'https://player.glomex.com/integration/1/iframe-player.html?playlistId=cl-bgqaata6aw8x&integrationId=19syy24xjn1oqlpc',
165 'id': 'cl-bgqaata6aw8x',
167 'playlist_mincount': 2,
# Build the iframe player URL for `video_id` and `integration`.
# The query string is produced by urllib.parse.urlencode and carries
# 'playlistId' and 'integrationId'. It is appended to
# 'https:' + cls._BASE_PLAYER_URL, and the result is passed with `origin_url`
# to cls._smuggle_origin_url, whose return value is returned.
171 def build_player_url(cls
, video_id
, integration
, origin_url
=None):
172 query_string
= urllib
.parse
.urlencode({
173 'playlistId': video_id
,
174 'integrationId': integration
,
176 return cls
._smuggle
_origin
_url
(f
'https:{cls._BASE_PLAYER_URL}?{query_string}', origin_url
)
# Generator yielding a player URL for each Glomex embed found in `webpage`,
# with `origin_url` attached via _smuggle_origin_url / build_player_url.
# EMBED_RE recognises three kinds of markup:
#   * <iframe src=...> whose quoted src matches VALID_SRC (the player URL),
#   * <glomex-player> / <div> tags carrying data-integration-id,
#     data-playlist-id and data-glomex-player attributes,
#   * inline <script> blocks with hard-coded integrationId / playlistId values.
# A <div> match is only considered when data-glomex-player="true" was captured.
# NOTE(review): the pattern names its id groups `id_html` and `id_js`, but the
# loop reads 'video_id_html' and 'video_id_js' from groupdict(). Those keys
# never exist, so mdict.get('video_id_html') / mdict.get('video_id_js') are
# always None and mdict['video_id_html'] / mdict['video_id_js'] would raise
# KeyError if reached. The group names and the lookups should be made to agree.
179 def _extract_urls(cls
, webpage
, origin_url
):
180 VALID_SRC
= rf
'(?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'
182 # https://docs.glomex.com/publisher/video-player-integration/javascript-api/
183 EMBED_RE
= r
'''(?x)(?:
184 <iframe[^>]+?src=(?P<_q1>%(quot_re)s)(?P<url>%(url_re)s)(?P=_q1)|
185 <(?P<html_tag>glomex-player|div)(?:
186 data-integration-id=(?P<_q2>%(quot_re)s)(?P<integration_html>(?:(?!(?P=_q2)).)+)(?P=_q2)|
187 data-playlist-id=(?P<_q3>%(quot_re)s)(?P<id_html>(?:(?!(?P=_q3)).)+)(?P=_q3)|
188 data-glomex-player=(?P<_q4>%(quot_re)s)(?P<glomex_player>true)(?P=_q4)|
191 # naive parsing of inline scripts for hard-coded integration parameters
192 <(?P<script_tag>script)[^<]*?>(?:
193 (?P<_stjs1>dataset\.)?integrationId\s*(?(_stjs1)=|:)\s*
194 (?P<_q5>%(quot_re)s)(?P<integration_js>(?:(?!(?P=_q5)).)+)(?P=_q5)\s*(?(_stjs1);|,)?|
195 (?P<_stjs2>dataset\.)?playlistId\s*(?(_stjs2)=|:)\s*
196 (?P<_q6>%(quot_re)s)(?P<id_js>(?:(?!(?P=_q6)).)+)(?P=_q6)\s*(?(_stjs2);|,)?|
199 )''' % {'quot_re': r'["\']', 'url_re': VALID_SRC}
201 for mobj
in re
.finditer(EMBED_RE
, webpage
):
202 mdict
= mobj
.groupdict()
204 url
= unescapeHTML(mdict
['url'])
205 if not cls
.suitable(url
):
207 yield cls
._smuggle
_origin
_url
(url
, origin_url
)
208 elif mdict
.get('html_tag'):
209 if mdict
['html_tag'] == 'div' and not mdict
.get('glomex_player'):
211 if not mdict
.get('video_id_html') or not mdict
.get('integration_html'):
213 yield cls
.build_player_url(mdict
['video_id_html'], mdict
['integration_html'], origin_url
)
214 elif mdict
.get('script_tag'):
215 if not mdict
.get('video_id_js') or not mdict
.get('integration_js'):
217 yield cls
.build_player_url(mdict
['video_id_js'], mdict
['integration_js'], origin_url
)
# Extract a player URL: recover the smuggled origin with
# self._unsmuggle_origin_url, take the playlist id from the URL via
# self._match_id and the first 'integrationId' value from its query string
# (None when the parameter is absent), then delegate to
# self._download_and_extract_api_data(playlist_id, integration, origin_url).
# Raises ExtractorError('No integrationId in URL', expected=True).
# NOTE(review): presumably raised only when no integrationId was found --
# confirm the guard in front of the raise.
219 def _real_extract(self
, url
):
220 url
, origin_url
= self
._unsmuggle
_origin
_url
(url
)
221 playlist_id
= self
._match
_id
(url
)
222 integration
= parse_qs(url
).get('integrationId', [None])[0]
224 raise ExtractorError('No integrationId in URL', expected
=True)
225 return self
._download
_and
_extract
_api
_data
(playlist_id
, integration
, origin_url
)