]>
Commit | Line | Data |
---|---|---|
5f6a1245 | 1 | # coding: utf-8 |
22a6f150 PH |
2 | from __future__ import unicode_literals |
3 | ||
219b8130 | 4 | import re |
b27c856f | 5 | import json |
a3c736de | 6 | import itertools |
219b8130 PH |
7 | |
8 | from .common import InfoExtractor | |
953e32b2 | 9 | |
1cc79574 | 10 | from ..utils import ( |
d3f007af | 11 | determine_ext, |
9b9c5355 | 12 | error_to_compat_str, |
7f8b2714 | 13 | ExtractorError, |
1cc79574 | 14 | int_or_none, |
d3f007af | 15 | parse_iso8601, |
5c2266df | 16 | sanitized_Request, |
f53c966a | 17 | str_to_int, |
4b10aadf | 18 | unescapeHTML, |
ab49d7a9 | 19 | mimetype2ext, |
219b8130 PH |
20 | ) |
21 | ||
5f6a1245 | 22 | |
70922df8 JMF |
class DailymotionBaseInfoExtractor(InfoExtractor):
    """Common base for the Dailymotion extractors.

    Adds download helpers that attach the family-filter-off cookies to
    every page request.
    """

    @staticmethod
    def _build_request(url):
        """Build a request with the family filter disabled"""
        no_ff_request = sanitized_Request(url)
        no_ff_request.add_header('Cookie', 'family_filter=off; ff=off')
        return no_ff_request

    def _download_webpage_handle_no_ff(self, url, *args, **kwargs):
        """Call ``_download_webpage_handle`` on a family-filter-off request.

        Extra positional and keyword arguments are forwarded unchanged.
        """
        return self._download_webpage_handle(
            self._build_request(url), *args, **kwargs)

    def _download_webpage_no_ff(self, url, *args, **kwargs):
        """Call ``_download_webpage`` on a family-filter-off request.

        Extra positional and keyword arguments are forwarded unchanged.
        """
        return self._download_webpage(
            self._build_request(url), *args, **kwargs)
5f6a1245 | 38 | |
219b8130 | 39 | |
class DailymotionIE(DailymotionBaseInfoExtractor):
    """Extractor for single Dailymotion videos.

    ``_real_extract`` tries three sources in order: the "player v5" JSON
    config embedded in the watch page, a Vevo embed link, and finally the
    legacy embed page.
    """

    _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:embed|swf|#)/)?video/(?P<id>[^/?_]+)'
    IE_NAME = 'dailymotion'

    # (key in the legacy embed page ``info`` object, format_id) pairs.
    # NOTE(review): 'hd180' looks like a typo for 'hd1080'; it is kept
    # unchanged because format ids are user-selectable.
    _FORMATS = [
        ('stream_h264_ld_url', 'ld'),
        ('stream_h264_url', 'standard'),
        ('stream_h264_hq_url', 'hq'),
        ('stream_h264_hd_url', 'hd'),
        ('stream_h264_hd1080_url', 'hd180'),
    ]

    _TESTS = [
        {
            'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
            'md5': '2137c41a8e78554bb09225b8eb322406',
            'info_dict': {
                'id': 'x2iuewm',
                'ext': 'mp4',
                'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',
                'description': 'Several come bundled with the Steam Controller.',
                'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
                'duration': 74,
                'timestamp': 1425657362,
                'upload_date': '20150306',
                'uploader': 'IGN',
                'uploader_id': 'xijv66',
                'age_limit': 0,
                'view_count': int,
                'comment_count': int,
            }
        },
        # Vevo video
        {
            'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi',
            'info_dict': {
                'title': 'Roar (Official)',
                'id': 'USUV71301934',
                'ext': 'mp4',
                'uploader': 'Katy Perry',
                'upload_date': '20130905',
            },
            'params': {
                'skip_download': True,
            },
            'skip': 'VEVO is only available in some countries',
        },
        # age-restricted video
        {
            'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
            'md5': '0d667a7b9cebecc3c89ee93099c4159d',
            'info_dict': {
                'id': 'xyh2zz',
                'ext': 'mp4',
                'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
                'uploader': 'HotWaves1012',
                'age_limit': 18,
            },
            'skip': 'video gone',
        },
        # geo-restricted, player v5
        {
            'url': 'http://www.dailymotion.com/video/xhza0o',
            'only_matching': True,
        },
        # with subtitles
        {
            'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news',
            'only_matching': True,
        },
        {
            'url': 'http://www.dailymotion.com/swf/video/x3n92nf',
            'only_matching': True,
        }
    ]

    @staticmethod
    def _extract_urls(webpage):
        """Return the URLs of all Dailymotion players embedded in ``webpage``.

        Matches ``<embed>``/``<iframe>`` ``src`` attributes and the
        ``dmcloudUrlEmissionSelect`` ``<input>`` value; each URL is
        HTML-unescaped before being returned.
        """
        # Look for embedded Dailymotion player
        matches = re.findall(
            r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
        # Each match is a (quote character, url) tuple.
        return [unescapeHTML(match[1]) for match in matches]

    def _real_extract(self, url):
        """Extract the info dict for the video referenced by ``url``.

        Returns either a video info dict or, for Vevo embeds, the result of
        ``url_result`` delegating to the Vevo extractor.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage_no_ff(
            'https://www.dailymotion.com/video/%s' % video_id, video_id)

        age_limit = self._rta_search(webpage)

        description = self._og_search_description(webpage) or self._html_search_meta(
            'description', webpage, 'description')

        view_count_str = self._search_regex(
            (r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([\s\d,.]+)"',
             r'video_views_count[^>]+>\s+([\s\d\,.]+)'),
            webpage, 'view count', fatal=False)
        if view_count_str:
            view_count_str = re.sub(r'\s', '', view_count_str)
        # The capture group admits whitespace, so the cleaned string may be
        # empty; treat that the same as "no match" instead of parsing ''.
        view_count = str_to_int(view_count_str or None)
        comment_count = int_or_none(self._search_regex(
            r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserComments:(\d+)"',
            webpage, 'comment count', fatal=False))

        player_v5 = self._search_regex(
            [r'buildPlayer\(({.+?})\);\n',  # See https://github.com/rg3/youtube-dl/issues/7826
             r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);',
             r'buildPlayer\(({.+?})\);',
             r'var\s+config\s*=\s*({.+?});'],
            webpage, 'player v5', default=None)
        if player_v5:
            player = self._parse_json(player_v5, video_id)
            metadata = player['metadata']

            self._check_error(metadata)

            formats = self._extract_player_v5_formats(metadata, video_id)

            # ``owner`` may be present but null; ``.get('owner', {})`` would
            # then return None and break the attribute lookups below.
            owner = metadata.get('owner') or {}

            return {
                'id': video_id,
                'title': metadata['title'],
                'description': description,
                'thumbnail': metadata.get('poster_url'),
                'duration': int_or_none(metadata.get('duration')),
                'timestamp': int_or_none(metadata.get('created_time')),
                'uploader': owner.get('screenname'),
                'uploader_id': owner.get('id'),
                'age_limit': age_limit,
                'view_count': view_count,
                'comment_count': comment_count,
                'formats': formats,
                'subtitles': self._extract_player_v5_subtitles(metadata),
            }

        # vevo embed
        vevo_id = self._search_regex(
            r'<link rel="video_src" href="[^"]*?vevo.com[^"]*?video=(?P<id>[\w]*)',
            webpage, 'vevo embed', default=None)
        if vevo_id:
            return self.url_result('vevo:%s' % vevo_id, 'Vevo')

        # fallback old player
        embed_page = self._download_webpage_no_ff(
            'https://www.dailymotion.com/embed/video/%s' % video_id,
            video_id, 'Downloading embed page')

        timestamp = parse_iso8601(self._html_search_meta(
            'video:release_date', webpage, 'upload date'))

        info = self._parse_json(
            self._search_regex(
                r'var info = ({.*?}),$', embed_page,
                'video info', flags=re.MULTILINE),
            video_id)

        self._check_error(info)

        formats = self._extract_legacy_formats(info)

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, webpage)

        title = self._og_search_title(webpage, default=None)
        if title is None:
            title = self._html_search_regex(
                r'(?s)<span\s+id="video_title"[^>]*>(.*?)</span>', webpage,
                'title')

        # Uploader, thumbnail and duration are optional metadata: a missing
        # key must not abort an otherwise successful extraction.
        return {
            'id': video_id,
            'formats': formats,
            'uploader': info.get('owner.screenname'),
            'timestamp': timestamp,
            'title': title,
            'description': description,
            'subtitles': video_subtitles,
            'thumbnail': info.get('thumbnail_url'),
            'age_limit': age_limit,
            'view_count': view_count,
            'comment_count': comment_count,
            'duration': int_or_none(info.get('duration')),
        }

    def _extract_player_v5_formats(self, metadata, video_id):
        """Build the sorted format list from player v5 ``metadata['qualities']``."""
        formats = []
        for quality, media_list in metadata['qualities'].items():
            for media in media_list:
                media_url = media.get('url')
                if not media_url:
                    continue
                type_ = media.get('type')
                if type_ == 'application/vnd.lumberjack.manifest':
                    continue
                ext = mimetype2ext(type_) or determine_ext(media_url)
                if ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        media_url, video_id, 'mp4', preference=-1,
                        m3u8_id='hls', fatal=False))
                elif ext == 'f4m':
                    formats.extend(self._extract_f4m_formats(
                        media_url, video_id, preference=-1, f4m_id='hds', fatal=False))
                else:
                    f = {
                        'url': media_url,
                        'format_id': 'http-%s' % quality,
                        'ext': ext,
                    }
                    # Direct URLs may embed the resolution as H264-<w>x<h>.
                    m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url)
                    if m:
                        f.update({
                            'width': int(m.group('width')),
                            'height': int(m.group('height')),
                        })
                    formats.append(f)
        self._sort_formats(formats)
        return formats

    @staticmethod
    def _extract_player_v5_subtitles(metadata):
        """Map subtitle language to subtitle entries from player v5 metadata.

        Returns an empty dict when the metadata carries no usable subtitle
        mapping (missing, null, or not a dict).
        """
        subtitles_info = metadata.get('subtitles')
        if not isinstance(subtitles_info, dict):
            return {}
        subtitles_data = subtitles_info.get('data')
        if not subtitles_data or not isinstance(subtitles_data, dict):
            return {}

        subtitles = {}
        for subtitle_lang, subtitle in subtitles_data.items():
            subtitles[subtitle_lang] = [{
                'ext': determine_ext(subtitle_url),
                'url': subtitle_url,
            } for subtitle_url in subtitle.get('urls') or []]
        return subtitles

    def _extract_legacy_formats(self, info):
        """Build the sorted format list from the legacy embed page ``info``."""
        formats = []
        for key, format_id in self._FORMATS:
            video_url = info.get(key)
            if video_url is None:
                continue
            m_size = re.search(r'H264-(\d+)x(\d+)', video_url)
            if m_size is not None:
                width = int_or_none(m_size.group(1))
                height = int_or_none(m_size.group(2))
            else:
                width, height = None, None
            formats.append({
                'url': video_url,
                'ext': 'mp4',
                'format_id': format_id,
                'width': width,
                'height': height,
            })
        self._sort_formats(formats)
        return formats

    def _check_error(self, info):
        """Raise an expected ``ExtractorError`` if ``info`` carries an error."""
        if info.get('error') is not None:
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, info['error']['title']), expected=True)

    def _get_subtitles(self, video_id, webpage):
        """Fetch subtitles for ``video_id`` from the Dailymotion API.

        Best effort: any download or parse problem is reported as a warning
        and an empty dict is returned. On success the result maps each
        language to a one-element list of ``{'url': ..., 'ext': 'srt'}``.
        """
        try:
            sub_list = self._download_webpage(
                'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
            return {}
        try:
            info = json.loads(sub_list)
        except ValueError as err:
            # Invalid JSON must not turn a best-effort lookup into a crash.
            self._downloader.report_warning('unable to parse video subtitles: %s' % error_to_compat_str(err))
            return {}
        if isinstance(info, dict) and (info.get('total') or 0) > 0:
            return dict(
                (sub['language'], [{'url': sub['url'], 'ext': 'srt'}])
                for sub in info.get('list') or [])
        self._downloader.report_warning('video doesn\'t have subtitles')
        return {}
304 | ||
a3c736de | 305 | |
class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
    """Extractor for Dailymotion playlist pages."""

    IE_NAME = 'dailymotion:playlist'
    _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/'
    _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
    _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s'
    _TESTS = [{
        'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q',
        'info_dict': {
            'title': 'SPORT',
            'id': 'xv4bw_nqtv_sport',
        },
        'playlist_mincount': 20,
    }]

    def _extract_entries(self, id):
        """Yield one url_result per distinct video found on the listing pages.

        Pages are fetched in increasing order, starting at 1, until a page
        resolves to an already visited URL or shows no "next page" arrow.
        """
        seen_video_ids = set()
        visited_urls = set()
        pagenum = 0
        while True:
            pagenum += 1
            page_url = self._PAGE_TEMPLATE % (id, pagenum)
            webpage, urlh = self._download_webpage_handle_no_ff(
                page_url, id, 'Downloading page %s' % pagenum)

            final_url = urlh.geturl()
            if final_url in visited_urls:
                # The final URL was already processed: stop instead of looping.
                self.report_warning(
                    'Stopped at duplicated page %s, which is the same as %s' % (
                        page_url, final_url),
                    id)
                return
            visited_urls.add(final_url)

            for video_id in re.findall(r'data-xid="(.+?)"', webpage):
                if video_id in seen_video_ids:
                    continue
                yield self.url_result(
                    'http://www.dailymotion.com/video/%s' % video_id,
                    DailymotionIE.ie_key(), video_id)
                seen_video_ids.add(video_id)

            if not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return

    def _real_extract(self, url):
        """Return a playlist result for the playlist referenced by ``url``."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(url, playlist_id)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage),
            'entries': self._extract_entries(playlist_id),
        }
39baacc4 JMF |
355 | |
356 | ||
class DailymotionUserIE(DailymotionPlaylistIE):
    """Extractor for all videos listed on a Dailymotion user page."""

    IE_NAME = 'dailymotion:user'
    _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)'
    _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
    _TESTS = [{
        'url': 'https://www.dailymotion.com/user/nqtv',
        'info_dict': {
            'id': 'nqtv',
            'title': 'Rémi Gaillard',
        },
        'playlist_mincount': 100,
    }, {
        'url': 'http://www.dailymotion.com/user/UnderProject',
        'info_dict': {
            'id': 'UnderProject',
            'title': 'UnderProject',
        },
        'playlist_mincount': 1800,
        'expected_warnings': [
            'Stopped at duplicated page',
        ],
        'skip': 'Takes too long time',
    }]

    def _real_extract(self, url):
        """Return a playlist result holding the user's videos.

        The playlist title is the display name read from the ``nav-image``
        link on the user page.
        """
        user = re.match(self._VALID_URL, url).group('user')
        profile_url = 'https://www.dailymotion.com/user/%s' % user
        webpage = self._download_webpage(profile_url, user)

        # The user id is escaped because it is interpolated into a regex.
        name_pattern = r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user)
        full_user = unescapeHTML(
            self._html_search_regex(name_pattern, webpage, 'user'))

        return {
            '_type': 'playlist',
            'id': user,
            'title': full_user,
            'entries': self._extract_entries(user),
        }
756f574e YCH |
396 | |
397 | ||
class DailymotionCloudIE(DailymotionBaseInfoExtractor):
    """Extractor for videos served through api.dmcloud.net embeds."""

    _VALID_URL_PREFIX = r'https?://api\.dmcloud\.net/(?:player/)?embed/'
    _VALID_URL = r'%s[^/]+/(?P<id>[^/?]+)' % _VALID_URL_PREFIX
    _VALID_EMBED_URL = r'%s[^/]+/[^\'"]+' % _VALID_URL_PREFIX

    _TESTS = [{
        # From http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html
        # Tested at FranceTvInfo_2
        'url': 'http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-ead4c701fb750cf9367bf4447167a3db&autoplay=1',
        'only_matching': True,
    }, {
        # http://www.francetvinfo.fr/societe/larguez-les-amarres-le-cobaturage-se-developpe_980101.html
        'url': 'http://api.dmcloud.net/player/embed/4e7343f894a6f677b10006b4/559545469473996d31429f06?auth=1467430263-0-90tglw2l-a3a4b64ed41efe48d7fccad85b8b8fda&autoplay=1',
        'only_matching': True,
    }]

    @classmethod
    def _extract_dmcloud_url(cls, webpage):
        """Return the first dmcloud embed URL found in ``webpage``, else None.

        An ``<iframe>`` ``src`` is tried first, then the value of the
        ``dmcloudUrlEmissionSelect`` ``<input>``.
        """
        embed_templates = (
            r'<iframe[^>]+src=[\'"](%s)[\'"]',
            r'<input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](%s)[\'"]',
        )
        for template in embed_templates:
            found = re.search(template % cls._VALID_EMBED_URL, webpage)
            if found:
                return found.group(1)
        return None

    def _real_extract(self, url):
        """Extract id, direct mp4 URL, title and thumbnail from the embed page."""
        video_id = self._match_id(url)

        webpage = self._download_webpage_no_ff(url, video_id)

        title = self._html_search_regex(r'<title>([^>]+)</title>', webpage, 'title')

        info_json = self._search_regex(
            r'var\s+info\s*=\s*([^;]+);', webpage, 'video info')
        video_info = self._parse_json(info_json, video_id)

        # TODO: parse ios_url, which is in fact a manifest
        return {
            'id': video_id,
            'url': video_info['mp4_url'],
            'title': title,
            'thumbnail': video_info.get('thumbnail_url'),
        }