]>
Commit | Line | Data |
---|---|---|
5f6a1245 | 1 | # coding: utf-8 |
22a6f150 PH |
2 | from __future__ import unicode_literals |
3 | ||
219b8130 | 4 | import re |
b27c856f | 5 | import json |
a3c736de | 6 | import itertools |
219b8130 PH |
7 | |
8 | from .common import InfoExtractor | |
953e32b2 | 9 | |
1cc79574 | 10 | from ..utils import ( |
d3f007af | 11 | determine_ext, |
9b9c5355 | 12 | error_to_compat_str, |
7f8b2714 | 13 | ExtractorError, |
1cc79574 | 14 | int_or_none, |
d3f007af | 15 | parse_iso8601, |
5c2266df | 16 | sanitized_Request, |
f53c966a | 17 | str_to_int, |
4b10aadf | 18 | unescapeHTML, |
219b8130 PH |
19 | ) |
20 | ||
5f6a1245 | 21 | |
70922df8 JMF |
class DailymotionBaseInfoExtractor(InfoExtractor):
    """Common base of the Dailymotion extractors.

    Adds download helpers whose requests carry the cookies that disable the
    family filter.
    """

    @staticmethod
    def _build_request(url):
        """Return a request for ``url`` with the family filter disabled."""
        no_ff_request = sanitized_Request(url)
        no_ff_request.add_header('Cookie', 'family_filter=off; ff=off')
        return no_ff_request

    def _download_webpage_handle_no_ff(self, url, *args, **kwargs):
        """Run ``_download_webpage_handle`` on a family-filter-free request.

        All further positional and keyword arguments are passed through
        untouched, and the wrapped method's result is returned as is.
        """
        return self._download_webpage_handle(
            self._build_request(url), *args, **kwargs)

    def _download_webpage_no_ff(self, url, *args, **kwargs):
        """Run ``_download_webpage`` on a family-filter-free request.

        All further positional and keyword arguments are passed through
        untouched, and the wrapped method's result is returned as is.
        """
        return self._download_webpage(
            self._build_request(url), *args, **kwargs)
5f6a1245 | 37 | |
219b8130 | 38 | |
class DailymotionIE(DailymotionBaseInfoExtractor):
    """Extractor for single Dailymotion video pages.

    ``_real_extract`` tries three sources in order: the "player v5" JSON
    embedded in the video page, a Vevo embed link, and finally the ``info``
    object of the legacy embed page.
    """
    _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)'
    IE_NAME = 'dailymotion'

    # (key in the legacy embed page's ``info`` object, format_id) pairs
    _FORMATS = [
        ('stream_h264_ld_url', 'ld'),
        ('stream_h264_url', 'standard'),
        ('stream_h264_hq_url', 'hq'),
        ('stream_h264_hd_url', 'hd'),
        ('stream_h264_hd1080_url', 'hd1080'),
    ]

    _TESTS = [
        {
            'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
            'md5': '2137c41a8e78554bb09225b8eb322406',
            'info_dict': {
                'id': 'x2iuewm',
                'ext': 'mp4',
                'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',
                'description': 'Several come bundled with the Steam Controller.',
                'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
                'duration': 74,
                'timestamp': 1425657362,
                'upload_date': '20150306',
                'uploader': 'IGN',
                'uploader_id': 'xijv66',
                'age_limit': 0,
                'view_count': int,
                'comment_count': int,
            }
        },
        # Vevo video
        {
            'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi',
            'info_dict': {
                'title': 'Roar (Official)',
                'id': 'USUV71301934',
                'ext': 'mp4',
                'uploader': 'Katy Perry',
                'upload_date': '20130905',
            },
            'params': {
                'skip_download': True,
            },
            'skip': 'VEVO is only available in some countries',
        },
        # age-restricted video
        {
            'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
            'md5': '0d667a7b9cebecc3c89ee93099c4159d',
            'info_dict': {
                'id': 'xyh2zz',
                'ext': 'mp4',
                'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
                'uploader': 'HotWaves1012',
                'age_limit': 18,
            }
        },
        # geo-restricted, player v5
        {
            'url': 'http://www.dailymotion.com/video/xhza0o',
            'only_matching': True,
        },
        # with subtitles
        {
            'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news',
            'only_matching': True,
        }
    ]

    def _real_extract(self, url):
        """Extract the info dict (or a Vevo ``url_result``) for ``url``.

        Raises ExtractorError (via ``_check_error``) when the site's metadata
        carries an ``error`` entry.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage_no_ff(
            'https://www.dailymotion.com/video/%s' % video_id, video_id)

        age_limit = self._rta_search(webpage)

        description = self._og_search_description(webpage) or self._html_search_meta(
            'description', webpage, 'description')

        view_count = str_to_int(self._search_regex(
            [r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:(\d+)"',
             r'video_views_count[^>]+>\s+([\d\.,]+)'],
            webpage, 'view count', fatal=False))
        comment_count = int_or_none(self._search_regex(
            r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserComments:(\d+)"',
            webpage, 'comment count', fatal=False))

        player_v5 = self._search_regex(
            [r'buildPlayer\(({.+?})\);\n',  # See https://github.com/rg3/youtube-dl/issues/7826
             r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);',
             r'buildPlayer\(({.+?})\);'],
            webpage, 'player v5', default=None)
        if player_v5:
            player = self._parse_json(player_v5, video_id)
            metadata = player['metadata']

            self._check_error(metadata)

            formats = []
            for quality, media_list in metadata['qualities'].items():
                for media in media_list:
                    media_url = media.get('url')
                    if not media_url:
                        continue
                    type_ = media.get('type')
                    # this manifest type is deliberately skipped
                    if type_ == 'application/vnd.lumberjack.manifest':
                        continue
                    ext = determine_ext(media_url)
                    if type_ == 'application/x-mpegURL' or ext == 'm3u8':
                        formats.extend(self._extract_m3u8_formats(
                            media_url, video_id, 'mp4', preference=-1,
                            m3u8_id='hls', fatal=False))
                    elif type_ == 'application/f4m' or ext == 'f4m':
                        formats.extend(self._extract_f4m_formats(
                            media_url, video_id, preference=-1, f4m_id='hds', fatal=False))
                    else:
                        f = {
                            'url': media_url,
                            'format_id': 'http-%s' % quality,
                        }
                        # plain HTTP URLs may embed the frame size
                        m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url)
                        if m:
                            f.update({
                                'width': int(m.group('width')),
                                'height': int(m.group('height')),
                            })
                        formats.append(f)
            self._sort_formats(formats)

            title = metadata['title']
            duration = int_or_none(metadata.get('duration'))
            timestamp = int_or_none(metadata.get('created_time'))
            thumbnail = metadata.get('poster_url')
            # 'owner' may be absent or null; treat both as "no owner data"
            owner = metadata.get('owner') or {}
            uploader = owner.get('screenname')
            uploader_id = owner.get('id')

            subtitles = {}
            # only use 'subtitles' / 'data' when they really are JSON objects
            subtitles_info = metadata.get('subtitles')
            subtitles_data = (
                subtitles_info.get('data')
                if isinstance(subtitles_info, dict) else None)
            if subtitles_data and isinstance(subtitles_data, dict):
                for subtitle_lang, subtitle in subtitles_data.items():
                    subtitles[subtitle_lang] = [{
                        'ext': determine_ext(subtitle_url),
                        'url': subtitle_url,
                    } for subtitle_url in subtitle.get('urls', [])]

            return {
                'id': video_id,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'timestamp': timestamp,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'age_limit': age_limit,
                'view_count': view_count,
                'comment_count': comment_count,
                'formats': formats,
                'subtitles': subtitles,
            }

        # vevo embed
        vevo_id = self._search_regex(
            r'<link rel="video_src" href="[^"]*?vevo.com[^"]*?video=(?P<id>[\w]*)',
            webpage, 'vevo embed', default=None)
        if vevo_id:
            return self.url_result('vevo:%s' % vevo_id, 'Vevo')

        # fallback old player
        embed_page = self._download_webpage_no_ff(
            'https://www.dailymotion.com/embed/video/%s' % video_id,
            video_id, 'Downloading embed page')

        timestamp = parse_iso8601(self._html_search_meta(
            'video:release_date', webpage, 'upload date'))

        info = self._parse_json(
            self._search_regex(
                r'var info = ({.*?}),$', embed_page,
                'video info', flags=re.MULTILINE),
            video_id)

        self._check_error(info)

        formats = []
        for (key, format_id) in self._FORMATS:
            video_url = info.get(key)
            if video_url is not None:
                m_size = re.search(r'H264-(\d+)x(\d+)', video_url)
                if m_size is not None:
                    width, height = map(int_or_none, (m_size.group(1), m_size.group(2)))
                else:
                    width, height = None, None
                formats.append({
                    'url': video_url,
                    'ext': 'mp4',
                    'format_id': format_id,
                    'width': width,
                    'height': height,
                })
        self._sort_formats(formats)

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, webpage)

        title = self._og_search_title(webpage, default=None)
        if title is None:
            title = self._html_search_regex(
                r'(?s)<span\s+id="video_title"[^>]*>(.*?)</span>', webpage,
                'title')

        # uploader, thumbnail and duration are optional metadata: a missing
        # key must not abort an otherwise successful extraction
        return {
            'id': video_id,
            'formats': formats,
            'uploader': info.get('owner.screenname'),
            'timestamp': timestamp,
            'title': title,
            'description': description,
            'subtitles': video_subtitles,
            'thumbnail': info.get('thumbnail_url'),
            'age_limit': age_limit,
            'view_count': view_count,
            'comment_count': comment_count,
            'duration': int_or_none(info.get('duration')),
        }

    def _check_error(self, info):
        """Raise an expected ExtractorError if ``info`` has an ``error`` entry."""
        error = info.get('error')
        if error is not None:
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, error['title']), expected=True)

    def _get_subtitles(self, video_id, webpage):
        """Fetch the subtitle list of ``video_id`` from the Dailymotion API.

        Returns a dict mapping each language to a one-element list of
        ``{'url': ..., 'ext': 'srt'}``. Returns an empty dict, after a
        warning, when the request fails, the response is not usable JSON or
        no subtitles are listed. ``webpage`` is not used.
        """
        try:
            sub_list = self._download_webpage(
                'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
            return {}
        try:
            info = json.loads(sub_list)
        except ValueError as err:
            # a malformed response is as non-fatal as a failed download
            self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
            return {}
        if isinstance(info, dict) and (info.get('total') or 0) > 0:
            # dict(generator) rather than a dict comprehension, matching the
            # file's existing style
            return dict(
                (sub['language'], [{'url': sub['url'], 'ext': 'srt'}])
                for sub in info.get('list') or [])
        self._downloader.report_warning('video doesn\'t have subtitles')
        return {}
286 | ||
a3c736de | 287 | |
class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
    """Extractor for Dailymotion playlist pages."""
    IE_NAME = 'dailymotion:playlist'
    _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/'
    _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
    _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s'
    _TESTS = [{
        'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q',
        'info_dict': {
            'title': 'SPORT',
            'id': 'xv4bw_nqtv_sport',
        },
        'playlist_mincount': 20,
    }]

    def _extract_entries(self, id):
        """Lazily yield one ``url_result`` per distinct video of the list.

        Pages built from ``_PAGE_TEMPLATE`` are fetched one after another,
        starting at page 1. Iteration ends when a response reports a URL that
        was already seen (a warning is emitted) or when the page no longer
        matches ``_MORE_PAGES_INDICATOR``.
        """
        seen_video_ids = set()
        visited_urls = set()
        pagenum = 0
        while True:
            pagenum += 1
            page_url = self._PAGE_TEMPLATE % (id, pagenum)
            webpage, urlh = self._download_webpage_handle_no_ff(
                page_url, id, 'Downloading page %s' % pagenum)

            # URL reported by the response handle for this page
            final_url = urlh.geturl()
            if final_url in visited_urls:
                self.report_warning('Stopped at duplicated page %s, which is the same as %s' % (
                    page_url, final_url), id)
                return
            visited_urls.add(final_url)

            for video_id in re.findall(r'data-xid="(.+?)"', webpage):
                if video_id in seen_video_ids:
                    continue
                yield self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')
                seen_video_ids.add(video_id)

            if not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return

    def _real_extract(self, url):
        """Return a playlist result titled after the page's OpenGraph title."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(url, playlist_id)
        playlist_title = self._og_search_title(webpage)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': playlist_title,
            'entries': self._extract_entries(playlist_id),
        }
39baacc4 JMF |
335 | |
336 | ||
class DailymotionUserIE(DailymotionPlaylistIE):
    """Extractor for the videos listed on a Dailymotion user page.

    Reuses the paging of ``DailymotionPlaylistIE._extract_entries`` with a
    user-specific ``_PAGE_TEMPLATE``.
    """
    IE_NAME = 'dailymotion:user'
    _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)'
    _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
    _TESTS = [{
        'url': 'https://www.dailymotion.com/user/nqtv',
        'info_dict': {
            'id': 'nqtv',
            'title': 'Rémi Gaillard',
        },
        'playlist_mincount': 100,
    }, {
        'url': 'http://www.dailymotion.com/user/UnderProject',
        'info_dict': {
            'id': 'UnderProject',
            'title': 'UnderProject',
        },
        'playlist_mincount': 1800,
        'expected_warnings': [
            'Stopped at duplicated page',
        ],
        'skip': 'Takes too long time',
    }]

    def _real_extract(self, url):
        """Return a playlist result whose title is the user's display name."""
        user = re.match(self._VALID_URL, url).group('user')
        profile_page = self._download_webpage(
            'https://www.dailymotion.com/user/%s' % user, user)

        # the display name is the title attribute of the profile's nav link
        name_pattern = r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user)
        full_user = unescapeHTML(self._html_search_regex(
            name_pattern, profile_page, 'user'))

        return {
            '_type': 'playlist',
            'id': user,
            'title': full_user,
            'entries': self._extract_entries(user),
        }
756f574e YCH |
376 | |
377 | ||
class DailymotionCloudIE(DailymotionBaseInfoExtractor):
    """Extractor for Dailymotion Cloud (api.dmcloud.net) embed players."""
    # both http and https embeds are accepted
    _VALID_URL_PREFIX = r'https?://api\.dmcloud\.net/(?:player/)?embed/'
    _VALID_URL = r'%s[^/]+/(?P<id>[^/?]+)' % _VALID_URL_PREFIX
    _VALID_EMBED_URL = r'%s[^/]+/[^\'"]+' % _VALID_URL_PREFIX

    _TESTS = [{
        # From http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html
        # Tested at FranceTvInfo_2
        'url': 'http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-ead4c701fb750cf9367bf4447167a3db&autoplay=1',
        'only_matching': True,
    }, {
        # http://www.francetvinfo.fr/societe/larguez-les-amarres-le-cobaturage-se-developpe_980101.html
        'url': 'http://api.dmcloud.net/player/embed/4e7343f894a6f677b10006b4/559545469473996d31429f06?auth=1467430263-0-90tglw2l-a3a4b64ed41efe48d7fccad85b8b8fda&autoplay=1',
        'only_matching': True,
    }]

    @classmethod
    def _extract_dmcloud_url(cls, webpage):
        """Return the first dmcloud embed URL found in ``webpage``, else None.

        An ``<iframe src=...>`` is looked for first, then the ``value`` of an
        ``<input id="dmcloudUrlEmissionSelect">``.
        """
        embed_patterns = (
            r'<iframe[^>]+src=[\'"](%s)[\'"]',
            r'<input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](%s)[\'"]',
        )
        for pattern in embed_patterns:
            mobj = re.search(pattern % cls._VALID_EMBED_URL, webpage)
            if mobj:
                return mobj.group(1)
        return None

    def _real_extract(self, url):
        """Extract id, title, mp4 URL and thumbnail from the embed page."""
        video_id = self._match_id(url)

        webpage = self._download_webpage_no_ff(url, video_id)

        title = self._html_search_regex(r'<title>([^>]+)</title>', webpage, 'title')

        # the embed page defines the video data as a JavaScript ``info`` variable
        video_info = self._parse_json(self._search_regex(
            r'var\s+info\s*=\s*([^;]+);', webpage, 'video info'), video_id)

        # TODO: parse ios_url, which is in fact a manifest
        video_url = video_info['mp4_url']

        return {
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': video_info.get('thumbnail_url'),
        }