]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/generic.py
[ffmpeg] Warn if ffmpeg/avconv version is too old (Fixes #4026)
[yt-dlp.git] / youtube_dl / extractor / generic.py
CommitLineData
cfe50f04
JMF
1# encoding: utf-8
2
79649588
PH
3from __future__ import unicode_literals
4
9b122384
PH
5import os
6import re
7
8from .common import InfoExtractor
fc9713a1 9from .youtube import YoutubeIE
9b122384 10from ..utils import (
9b122384 11 compat_urllib_parse,
a5caba1e 12 compat_urlparse,
f7300c5c 13 compat_xml_parse_error,
9b122384 14
b759a0d4 15 determine_ext,
9b122384 16 ExtractorError,
c8e9a235 17 float_or_none,
aa94a6d3 18 HEADRequest,
ed2d6a19 19 orderedSet,
bcf89ce6 20 parse_xml,
9d4660ca
PH
21 smuggle_url,
22 unescapeHTML,
42393ce2 23 unified_strdate,
4d54ef20 24 unsmuggle_url,
42393ce2 25 url_basename,
9b122384 26)
cfe50f04 27from .brightcove import BrightcoveIE
c0d0b01f 28from .ooyala import OoyalaIE
93d020dd 29from .rutv import RUTVIE
cb3ac1c6 30from .smotri import SmotriIE
1419fafd 31from .condenast import CondeNastIE
9b122384 32
0838239e 33
9b122384 34class GenericIE(InfoExtractor):
79649588 35 IE_DESC = 'Generic downloader that works on some sites'
9b122384 36 _VALID_URL = r'.*'
79649588 37 IE_NAME = 'generic'
cfe50f04
JMF
38 _TESTS = [
39 {
79649588 40 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
d360a146 41 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
79649588 42 'info_dict': {
d360a146
S
43 'id': '13601338388002',
44 'ext': 'mp4',
79649588
PH
45 'uploader': 'www.hodiho.fr',
46 'title': 'R\u00e9gis plante sa Jeep',
cfe50f04
JMF
47 }
48 },
c19f7764
JMF
49 # bandcamp page with custom domain
50 {
79649588
PH
51 'add_ie': ['Bandcamp'],
52 'url': 'http://bronyrock.com/track/the-pony-mash',
79649588 53 'info_dict': {
fd50bf62
S
54 'id': '3235767654',
55 'ext': 'mp3',
79649588
PH
56 'title': 'The Pony Mash',
57 'uploader': 'M_Pallante',
c19f7764 58 },
79649588 59 'skip': 'There is a limit of 200 free downloads / month for the test song',
c19f7764 60 },
eeb165e6 61 # embedded brightcove video
dd5bcdc4
JMF
62 # it also tests brightcove videos that need to set the 'Referer' in the
63 # http requests
eeb165e6 64 {
79649588
PH
65 'add_ie': ['Brightcove'],
66 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
67 'info_dict': {
68 'id': '2765128793001',
69 'ext': 'mp4',
70 'title': 'Le cours de bourse : l’analyse technique',
71 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
72 'uploader': 'BFM BUSINESS',
eeb165e6 73 },
79649588
PH
74 'params': {
75 'skip_download': True,
eeb165e6
JMF
76 },
77 },
17ab4d3b
PH
78 {
79 # https://github.com/rg3/youtube-dl/issues/2253
80 'url': 'http://bcove.me/i6nfkrc3',
17ab4d3b
PH
81 'md5': '0ba9446db037002366bab3b3eb30c88c',
82 'info_dict': {
fd50bf62
S
83 'id': '3101154703001',
84 'ext': 'mp4',
17ab4d3b
PH
85 'title': 'Still no power',
86 'uploader': 'thestar.com',
87 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
88 },
89 'add_ie': ['Brightcove'],
90 },
0479c625
S
91 {
92 'url': 'http://www.championat.com/video/football/v/87/87499.html',
93 'md5': 'fb973ecf6e4a78a67453647444222983',
94 'info_dict': {
95 'id': '3414141473001',
96 'ext': 'mp4',
97 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
98 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
99 'uploader': 'Championat',
100 },
101 },
42393ce2
PH
102 # Direct link to a video
103 {
79649588 104 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
79649588
PH
105 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
106 'info_dict': {
107 'id': 'trailer',
89ef304b 108 'ext': 'mp4',
79649588
PH
109 'title': 'trailer',
110 'upload_date': '20100513',
42393ce2 111 }
c0d0b01f
JMF
112 },
113 # ooyala video
114 {
79649588
PH
115 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
116 'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
117 'info_dict': {
118 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
119 'ext': 'mp4',
3486df38 120 'title': '2cc213299525360.mov', # that's what we get
c0d0b01f
JMF
121 },
122 },
89ef304b
PH
123 # google redirect
124 {
125 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
126 'info_dict': {
127 'id': 'cmQHVoWB5FY',
128 'ext': 'mp4',
129 'upload_date': '20130224',
130 'uploader_id': 'TheVerge',
131 'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.',
132 'uploader': 'The Verge',
133 'title': 'First Firefox OS phones side-by-side',
134 },
135 'params': {
136 'skip_download': False,
137 }
f55a1f0a 138 },
1b86cc41 139 # embed.ly video
140 {
141 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
142 'info_dict': {
143 'id': '9ODmcdjQcHQ',
144 'ext': 'mp4',
0a5bce56
PH
145 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
146 'upload_date': '20140225',
147 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
148 'uploader': 'Tested',
149 'uploader_id': 'testedcom',
1b86cc41 150 },
151 # No need to test YoutubeIE here
152 'params': {
153 'skip_download': True,
154 },
155 },
60cc4dc4
PH
156 # funnyordie embed
157 {
158 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
60cc4dc4
PH
159 'info_dict': {
160 'id': '18e820ec3f',
161 'ext': 'mp4',
162 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
163 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
93d020dd 164 },
60cc4dc4 165 },
93d020dd
S
166 # RUTV embed
167 {
168 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
169 'info_dict': {
170 'id': '776940',
171 'ext': 'mp4',
172 'title': 'Охотское море стало целиком российским',
173 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
174 },
175 'params': {
176 # m3u8 download
177 'skip_download': True,
178 },
aab74fa1
PH
179 },
180 # Embedded TED video
181 {
182 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
a8eb5a8e 183 'md5': '65fdff94098e4a607385a60c5177c638',
aab74fa1 184 'info_dict': {
a8eb5a8e 185 'id': '1969',
aab74fa1 186 'ext': 'mp4',
a8eb5a8e
PH
187 'title': 'Hidden miracles of the natural world',
188 'uploader': 'Louie Schwartzberg',
189 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
aab74fa1 190 }
60cc4dc4 191 },
5c386252 192 # Embedded Ustream video
193 {
194 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
195 'md5': '27b99cdb639c9b12a79bca876a073417',
196 'info_dict': {
ca6aada4 197 'id': '45734260',
198 'ext': 'flv',
199 'uploader': 'AU SPA: The NSA and Privacy',
5c386252 200 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
201 }
202 },
d95e35d6
S
203 # nowvideo embed hidden behind percent encoding
204 {
205 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
206 'md5': '2baf4ddd70f697d94b1c18cf796d5107',
207 'info_dict': {
208 'id': '06e53103ca9aa',
209 'ext': 'flv',
210 'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
211 'description': 'No description',
212 },
0f2a2ba1 213 },
893f8832
PH
214 # arte embed
215 {
216 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
217 'md5': '7653032cbb25bf6c80d80f217055fa43',
218 'info_dict': {
219 'id': '048195-004_PLUS7-F',
220 'ext': 'flv',
221 'title': 'X:enius',
222 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
223 'upload_date': '20140320',
224 },
225 'params': {
226 'skip_download': 'Requires rtmpdump'
227 }
228 },
fa35cdad
PH
229 # Condé Nast embed
230 {
231 'url': 'http://www.wired.com/2014/04/honda-asimo/',
232 'md5': 'ba0dfe966fa007657bd1443ee672db0f',
233 'info_dict': {
234 'id': '53501be369702d3275860000',
235 'ext': 'mp4',
236 'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
237 }
ebd3c7b3
PH
238 },
239 # Dailymotion embed
240 {
241 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
242 'md5': '441aeeb82eb72c422c7f14ec533999cd',
243 'info_dict': {
244 'id': 'k2mm4bCdJ6CQ2i7c8o2',
245 'ext': 'mp4',
246 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
247 'uploader': 'Spi0n',
248 },
249 'add_ie': ['Dailymotion'],
2b88feed
PH
250 },
251 # YouTube embed
252 {
253 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
254 'info_dict': {
255 'id': 'FXRb4ykk4S0',
256 'ext': 'mp4',
257 'title': 'The NBL Auction 2014',
258 'uploader': 'BADMINTON England',
259 'uploader_id': 'BADMINTONEvents',
260 'upload_date': '20140603',
261 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
262 },
263 'add_ie': ['Youtube'],
264 'params': {
265 'skip_download': True,
266 }
267 },
c5cd249e
JMF
268 # MTVServices embed
269 {
270 'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
271 'md5': '35727f82f58c76d996fc188f9755b0d5',
272 'info_dict': {
273 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
274 'ext': 'mp4',
275 'title': 'Review',
276 'description': 'Mario\'s life in the fast lane has never looked so good.',
277 },
278 },
61013473 279 # YouTube embed via <data-embed-url="">
280 {
281 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
61013473 282 'info_dict': {
a8eb5a8e 283 'id': '4vAffPZIT44',
61013473 284 'ext': 'mp4',
a8eb5a8e 285 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
ed2d6a19
PH
286 'uploader': 'Gameloft',
287 'uploader_id': 'gameloft',
a8eb5a8e
PH
288 'upload_date': '20140828',
289 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
ed2d6a19
PH
290 },
291 'params': {
292 'skip_download': True,
61013473 293 }
c8e9a235
PH
294 },
295 # Camtasia studio
296 {
297 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
298 'playlist': [{
299 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
300 'info_dict': {
301 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
302 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
303 'ext': 'flv',
304 'duration': 2235.90,
305 }
306 }, {
307 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
308 'info_dict': {
309 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
310 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
311 'ext': 'flv',
312 'duration': 2235.93,
313 }
314 }],
315 'info_dict': {
316 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
317 }
4d805e06
PH
318 },
319 # Flowplayer
320 {
321 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
322 'md5': '9d65602bf31c6e20014319c7d07fba27',
323 'info_dict': {
324 'id': '5123ea6d5e5a7',
325 'ext': 'mp4',
326 'age_limit': 18,
327 'uploader': 'www.handjobhub.com',
328 'title': 'Busty Blonde Siri Tit Fuck While Wank at Handjob Hub',
329 }
0990305d
PH
330 },
331 # RSS feed
332 {
333 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
334 'info_dict': {
335 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
336 'title': 'Zero Punctuation',
337 'description': 're:'
338 },
339 'playlist_mincount': 11,
22a6f150
PH
340 },
341 # Multiple brightcove videos
342 # https://github.com/rg3/youtube-dl/issues/2283
343 {
344 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
345 'info_dict': {
346 'id': 'always-never',
347 'title': 'Always / Never - The New Yorker',
348 },
349 'playlist_count': 3,
350 'params': {
351 'extract_flat': False,
352 'skip_download': True,
353 }
1a94ff68
S
354 },
355 # MLB embed
356 {
357 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
358 'md5': '96f09a37e44da40dd083e12d9a683327',
359 'info_dict': {
360 'id': '33322633',
361 'ext': 'mp4',
362 'title': 'Ump changes call to ball',
363 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
364 'duration': 48,
365 'timestamp': 1401537900,
366 'upload_date': '20140531',
367 'thumbnail': 're:^https?://.*\.jpg$',
368 },
369 },
746c67d7
NJ
370 # Wistia embed
371 {
372 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
373 'md5': '8788b683c777a5cf25621eaf286d0c23',
374 'info_dict': {
375 'id': '1cfaf6b7ea',
376 'ext': 'mov',
377 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
378 'duration': 643.0,
379 'filesize': 182808282,
380 'uploader': 'education-portal.com',
381 },
382 },
52cffcb1 383 {
384 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
385 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
386 'info_dict': {
387 'id': 'uxjb0lwrcz',
388 'ext': 'mp4',
85d7b765 389 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
52cffcb1 390 'duration': 1715.0,
85d7b765 391 'uploader': 'thoughtworks.wistia.com',
52cffcb1 392 },
393 },
cfe50f04 394 ]
9b122384 395
9b122384
PH
396 def report_following_redirect(self, new_url):
397 """Report information extraction."""
79649588 398 self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
9b122384 399
4fc946b5
PH
400 def _extract_rss(self, url, video_id, doc):
401 playlist_title = doc.find('./channel/title').text
402 playlist_desc_el = doc.find('./channel/description')
403 playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
404
405 entries = [{
406 '_type': 'url',
407 'url': e.find('link').text,
408 'title': e.find('title').text,
409 } for e in doc.findall('./channel/item')]
410
411 return {
412 '_type': 'playlist',
413 'id': url,
414 'title': playlist_title,
415 'description': playlist_desc,
416 'entries': entries,
417 }
418
c8e9a235
PH
419 def _extract_camtasia(self, url, video_id, webpage):
420 """ Returns None if no camtasia video can be found. """
421
422 camtasia_cfg = self._search_regex(
423 r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
424 webpage, 'camtasia configuration file', default=None)
425 if camtasia_cfg is None:
426 return None
427
428 title = self._html_search_meta('DC.title', webpage, fatal=True)
429
430 camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
431 camtasia_cfg = self._download_xml(
432 camtasia_url, video_id,
433 note='Downloading camtasia configuration',
434 errnote='Failed to download camtasia configuration')
435 fileset_node = camtasia_cfg.find('./playlist/array/fileset')
436
437 entries = []
438 for n in fileset_node.getchildren():
439 url_n = n.find('./uri')
440 if url_n is None:
441 continue
442
443 entries.append({
444 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
445 'title': '%s - %s' % (title, n.tag),
446 'url': compat_urlparse.urljoin(url, url_n.text),
447 'duration': float_or_none(n.find('./duration').text),
448 })
449
450 return {
451 '_type': 'playlist',
452 'entries': entries,
453 'title': title,
454 }
455
9b122384 456 def _real_extract(self, url):
ebd3c7b3
PH
457 if url.startswith('//'):
458 return {
459 '_type': 'url',
20991253 460 'url': self.http_scheme() + url,
ebd3c7b3
PH
461 }
462
a7130543
JMF
463 parsed_url = compat_urlparse.urlparse(url)
464 if not parsed_url.scheme:
04b4d394
PH
465 default_search = self._downloader.params.get('default_search')
466 if default_search is None:
1f7ccb90 467 default_search = 'fixup_error'
04b4d394 468
1f7ccb90 469 if default_search in ('auto', 'auto_warning', 'fixup_error'):
04b4d394
PH
470 if '/' in url:
471 self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
472 return self.url_result('http://' + url)
1f7ccb90 473 elif default_search != 'fixup_error':
9c1fc022 474 if default_search == 'auto_warning':
0e67ab0d
PH
475 if re.match(r'^(?:url|URL)$', url):
476 raise ExtractorError(
477 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
478 expected=True)
479 else:
480 self._downloader.report_warning(
7571c02c 481 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
04b4d394 482 return self.url_result('ytsearch:' + url)
1f7ccb90
PH
483
484 if default_search in ('error', 'fixup_error'):
7571c02c
PH
485 raise ExtractorError(
486 ('%r is not a valid URL. '
eef4a7a3 487 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
7571c02c 488 ) % (url, url), expected=True)
04b4d394 489 else:
f2f2c0c2
PH
490 if ':' not in default_search:
491 default_search += ':'
04b4d394 492 return self.url_result(default_search + url)
4d54ef20
PH
493
494 url, smuggled_data = unsmuggle_url(url)
495 force_videoid = None
d6e6a422 496 is_intentional = smuggled_data and smuggled_data.get('to_generic')
4d54ef20
PH
497 if smuggled_data and 'force_videoid' in smuggled_data:
498 force_videoid = smuggled_data['force_videoid']
499 video_id = force_videoid
500 else:
501 video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
a7130543 502
79649588 503 self.to_screen('%s: Requesting header' % video_id)
c1d1facd 504
ebab4520
PH
505 head_req = HEADRequest(url)
506 response = self._request_webpage(
507 head_req, video_id,
508 note=False, errnote='Could not send HEAD request to %s' % url,
509 fatal=False)
42393ce2 510
ebab4520 511 if response is not False:
42393ce2
PH
512 # Check for redirect
513 new_url = response.geturl()
514 if url != new_url:
515 self.report_following_redirect(new_url)
4d54ef20
PH
516 if force_videoid:
517 new_url = smuggle_url(
518 new_url, {'force_videoid': force_videoid})
cecaaf3f 519 return self.url_result(new_url)
42393ce2
PH
520
521 # Check for direct link to a video
522 content_type = response.headers.get('Content-Type', '')
3e785145 523 m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
42393ce2
PH
524 if m:
525 upload_date = response.headers.get('Last-Modified')
526 if upload_date:
527 upload_date = unified_strdate(upload_date)
42393ce2
PH
528 return {
529 'id': video_id,
530 'title': os.path.splitext(url_basename(url))[0],
531 'formats': [{
532 'format_id': m.group('format_id'),
533 'url': url,
79649588 534 'vcodec': 'none' if m.group('type') == 'audio' else None
42393ce2
PH
535 }],
536 'upload_date': upload_date,
537 }
538
d6e6a422
PH
539 if not self._downloader.params.get('test', False) and not is_intentional:
540 self._downloader.report_warning('Falling back on generic information extractor.')
541
9b122384
PH
542 try:
543 webpage = self._download_webpage(url, video_id)
544 except ValueError:
545 # since this is the last-resort InfoExtractor, if
546 # this error is thrown, it'll be thrown here
79649588 547 raise ExtractorError('Failed to download URL: %s' % url)
9b122384
PH
548
549 self.report_extraction(video_id)
887c6acd 550
4fc946b5
PH
551 # Is it an RSS feed?
552 try:
bcf89ce6 553 doc = parse_xml(webpage)
4fc946b5
PH
554 if doc.tag == 'rss':
555 return self._extract_rss(url, video_id, doc)
f7300c5c 556 except compat_xml_parse_error:
4fc946b5
PH
557 pass
558
c8e9a235
PH
559 # Is it a Camtasia project?
560 camtasia_res = self._extract_camtasia(url, video_id, webpage)
561 if camtasia_res is not None:
562 return camtasia_res
563
14390730
S
564 # Sometimes embedded video player is hidden behind percent encoding
565 # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
566 # Unescaping the whole page allows to handle those cases in a generic way
1f7659db
S
567 webpage = compat_urllib_parse.unquote(webpage)
568
887c6acd
PH
569 # it's tempting to parse this further, but you would
570 # have to take into account all the variations like
571 # Video Title - Site Name
572 # Site Name | Video Title
573 # Video Title - Tagline | Site Name
574 # and so on and so forth; it's just not practical
ef4fd848 575 video_title = self._html_search_regex(
79649588
PH
576 r'(?s)<title>(.*?)</title>', webpage, 'video title',
577 default='video')
ef4fd848 578
4d805e06
PH
579 # Try to detect age limit automatically
580 age_limit = self._rta_search(webpage)
581 # And then there are the jokers who advertise that they use RTA,
582 # but actually don't.
583 AGE_LIMIT_MARKERS = [
584 r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
585 ]
586 if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
587 age_limit = 18
588
ef4fd848
PH
589 # video uploader is domain name
590 video_uploader = self._search_regex(
79649588 591 r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
887c6acd 592
ed2d6a19
PH
593 # Helper method
594 def _playlist_from_matches(matches, getter, ie=None):
3b2f933b
PH
595 urlrs = orderedSet(
596 self.url_result(self._proto_relative_url(getter(m)), ie)
597 for m in matches)
ed2d6a19
PH
598 return self.playlist_result(
599 urlrs, playlist_id=video_id, playlist_title=video_title)
600
627a91a9 601 # Look for BrightCove:
99877772
PH
602 bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
603 if bc_urls:
79649588 604 self.to_screen('Brightcove video detected.')
99877772
PH
605 entries = [{
606 '_type': 'url',
607 'url': smuggle_url(bc_url, {'Referer': url}),
608 'ie_key': 'Brightcove'
609 } for bc_url in bc_urls]
610
611 return {
612 '_type': 'playlist',
613 'title': video_title,
614 'id': video_id,
615 'entries': entries,
616 }
cfe50f04 617
7115ca84 618 # Look for embedded (iframe) Vimeo player
9d4660ca 619 mobj = re.search(
15fd51b3 620 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
9d4660ca 621 if mobj:
15fd51b3 622 player_url = unescapeHTML(mobj.group('url'))
9d4660ca 623 surl = smuggle_url(player_url, {'Referer': url})
09a42738 624 return self.url_result(surl)
9d4660ca 625
7115ca84
PH
626 # Look for embedded (swf embed) Vimeo player
627 mobj = re.search(
09a42738 628 r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
7115ca84 629 if mobj:
09a42738 630 return self.url_result(mobj.group(1))
7115ca84 631
53c1d3ef 632 # Look for embedded YouTube player
1f9da904 633 matches = re.findall(r'''(?x)
2b88feed
PH
634 (?:
635 <iframe[^>]+?src=|
c71dfccc 636 data-video-url=|
2b88feed 637 <embed[^>]+?src=|
a7e97f6d
PH
638 embedSWF\(?:\s*|
639 new\s+SWFObject\(
2b88feed
PH
640 )
641 (["\'])
1bf5423e 642 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
6b08cdf6 643 (?:embed|v|p)/.+?)
1f9da904 644 \1''', webpage)
887c6acd 645 if matches:
ed2d6a19 646 return _playlist_from_matches(
3b2f933b 647 matches, lambda m: unescapeHTML(m[1]))
53c1d3ef 648
355e4fd0
PH
649 # Look for embedded Dailymotion player
650 matches = re.findall(
ef4fd848 651 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
355e4fd0 652 if matches:
ed2d6a19
PH
653 return _playlist_from_matches(
654 matches, lambda m: unescapeHTML(m[1]))
355e4fd0 655
8489578d
NJ
656 # Look for embedded Dailymotion playlist player (#3822)
657 m = re.search(
658 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
659 if m:
660 playlists = re.findall(
661 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
662 if playlists:
663 return _playlist_from_matches(
664 playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
665
ef4fd848
PH
666 # Look for embedded Wistia player
667 match = re.search(
281d3f1d 668 r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
ef4fd848 669 if match:
9471c444
NJ
670 embed_url = self._proto_relative_url(
671 unescapeHTML(match.group('url')))
ef4fd848
PH
672 return {
673 '_type': 'url_transparent',
9471c444 674 'url': embed_url,
ef4fd848
PH
675 'ie_key': 'Wistia',
676 'uploader': video_uploader,
677 'title': video_title,
678 'id': video_id,
679 }
52cffcb1 680
9471c444 681 match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
746c67d7
NJ
682 if match:
683 return {
684 '_type': 'url_transparent',
685 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
686 'ie_key': 'Wistia',
687 'uploader': video_uploader,
688 'title': video_title,
689 'id': match.group('id')
690 }
ef4fd848 691
ee3e63e4 692 # Look for embedded blip.tv player
19dab5e6 693 mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
ee3e63e4 694 if mobj:
19dab5e6 695 return self.url_result('http://blip.tv/a/a-'+mobj.group(1), 'BlipTV')
1f8b6af7 696 mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
ee3e63e4 697 if mobj:
19dab5e6 698 return self.url_result(mobj.group(1), 'BlipTV')
ee3e63e4 699
fa35cdad
PH
700 # Look for embedded condenast player
701 matches = re.findall(
702 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
703 webpage)
704 if matches:
705 return {
706 '_type': 'playlist',
707 'entries': [{
708 '_type': 'url',
709 'ie_key': 'CondeNast',
710 'url': ma,
711 } for ma in matches],
712 'title': video_title,
713 'id': video_id,
714 }
715
c19f7764
JMF
716 # Look for Bandcamp pages with custom domain
717 mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
718 if mobj is not None:
719 burl = unescapeHTML(mobj.group(1))
09804265
JMF
720 # Don't set the extractor because it can be a track url or an album
721 return self.url_result(burl)
c19f7764 722
f25571ff
PH
723 # Look for embedded Vevo player
724 mobj = re.search(
725 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
726 if mobj is not None:
727 return self.url_result(mobj.group('url'))
728
c0d0b01f 729 # Look for Ooyala videos
750f9020
JMF
730 mobj = (re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
731 re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage))
c0d0b01f 732 if mobj is not None:
750f9020 733 return OoyalaIE._build_url_result(mobj.group('ec'))
c0d0b01f 734
aa94a6d3 735 # Look for Aparat videos
48099643 736 mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
aa94a6d3
PH
737 if mobj is not None:
738 return self.url_result(mobj.group(1), 'Aparat')
739
c93c2ab1 740 # Look for MPORA videos
c3f51436 741 mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
c93c2ab1
PH
742 if mobj is not None:
743 return self.url_result(mobj.group(1), 'Mpora')
5f59ee79 744
15c0e8e7 745 # Look for embedded NovaMov-based player
8f89e687 746 mobj = re.search(
8dfa187b 747 r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
15c0e8e7
S
748 (?P<url>http://(?:(?:embed|www)\.)?
749 (?:novamov\.com|
750 nowvideo\.(?:ch|sx|eu|at|ag|co)|
751 videoweed\.(?:es|com)|
752 movshare\.(?:net|sx|ag)|
753 divxstage\.(?:eu|net|ch|co|at|ag))
754 /embed\.php.+?)\1''', webpage)
8f89e687 755 if mobj is not None:
15c0e8e7 756 return self.url_result(mobj.group('url'))
50f56607 757
9834872b
PH
758 # Look for embedded Facebook player
759 mobj = re.search(
db1f3888 760 r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
9834872b
PH
761 if mobj is not None:
762 return self.url_result(mobj.group('url'), 'Facebook')
763
ca97a56e
S
764 # Look for embedded VK player
765 mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
766 if mobj is not None:
767 return self.url_result(mobj.group('url'), 'VK')
768
0364fa8b
S
769 # Look for embedded ivi player
770 mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
771 if mobj is not None:
772 return self.url_result(mobj.group('url'), 'Ivi')
773
db1f3888
PH
774 # Look for embedded Huffington Post player
775 mobj = re.search(
c3f51436 776 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
db1f3888
PH
777 if mobj is not None:
778 return self.url_result(mobj.group('url'), 'HuffPost')
779
1b86cc41 780 # Look for embed.ly
781 mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
782 if mobj is not None:
783 return self.url_result(mobj.group('url'))
784 mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
785 if mobj is not None:
786 return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
787
60cc4dc4
PH
788 # Look for funnyordie embed
789 matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
790 if matches:
ed2d6a19
PH
791 return _playlist_from_matches(
792 matches, getter=unescapeHTML, ie='FunnyOrDie')
60cc4dc4 793
93d020dd
S
794 # Look for embedded RUTV player
795 rutv_url = RUTVIE._extract_url(webpage)
796 if rutv_url:
797 return self.url_result(rutv_url, 'RUTV')
798
7e2ede98
JMF
799 # Look for embedded TED player
800 mobj = re.search(
801 r'<iframe[^>]+?src=(["\'])(?P<url>http://embed\.ted\.com/.+?)\1', webpage)
802 if mobj is not None:
803 return self.url_result(mobj.group('url'), 'TED')
804
5c386252 805 # Look for embedded Ustream videos
806 mobj = re.search(
807 r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
808 if mobj is not None:
809 return self.url_result(mobj.group('url'), 'Ustream')
810
893f8832
PH
811 # Look for embedded arte.tv player
812 mobj = re.search(
813 r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
814 webpage)
815 if mobj is not None:
816 return self.url_result(mobj.group('url'), 'ArteTVEmbed')
817
cb3ac1c6
S
818 # Look for embedded smotri.com player
819 smotri_url = SmotriIE._extract_url(webpage)
820 if smotri_url:
821 return self.url_result(smotri_url, 'Smotri')
822
20991253
PH
823 # Look for embeded soundcloud player
824 mobj = re.search(
825 r'<iframe src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
826 webpage)
827 if mobj is not None:
828 url = unescapeHTML(mobj.group('url'))
829 return self.url_result(url)
830
826ec77f
PH
831 # Look for embedded vulture.com player
832 mobj = re.search(
833 r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
834 webpage)
835 if mobj is not None:
836 url = unescapeHTML(mobj.group('url'))
837 return self.url_result(url, ie='Vulture')
838
c5cd249e
JMF
839 # Look for embedded mtvservices player
840 mobj = re.search(
841 r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
842 webpage)
843 if mobj is not None:
844 url = unescapeHTML(mobj.group('url'))
845 return self.url_result(url, ie='MTVServicesEmbedded')
846
49807b4a
S
847 # Look for embedded yahoo player
848 mobj = re.search(
849 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
850 webpage)
851 if mobj is not None:
852 return self.url_result(mobj.group('url'), 'Yahoo')
853
2ef6fcb5
PH
854 # Look for embedded sbs.com.au player
855 mobj = re.search(
856 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)sbs\.com\.au/ondemand/video/single/.+?)\1',
857 webpage)
858 if mobj is not None:
859 return self.url_result(mobj.group('url'), 'SBS')
860
1a94ff68
S
861 mobj = re.search(
862 r'<iframe[^>]+?src=(["\'])(?P<url>https?://m\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
863 webpage)
864 if mobj is not None:
865 return self.url_result(mobj.group('url'), 'MLB')
866
1419fafd
S
867 mobj = re.search(
868 r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
869 webpage)
870 if mobj is not None:
871 return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
872
ced659bb
S
873 def check_video(vurl):
874 vpath = compat_urlparse.urlparse(vurl).path
875 vext = determine_ext(vpath)
876 return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
877
878 def filter_video(urls):
879 return list(filter(check_video, urls))
880
9b122384 881 # Start with something easy: JW Player in SWFObject
ced659bb 882 found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
b30b8698 883 if not found:
d981cef6 884 # Look for gorilla-vid style embedding
ced659bb 885 found = filter_video(re.findall(r'''(?sx)
c0292e8a
PH
886 (?:
887 jw_plugins|
888 JWPlayerOptions|
889 jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
890 )
ced659bb 891 .*?file\s*:\s*["\'](.*?)["\']''', webpage))
b30b8698 892 if not found:
9b122384 893 # Broaden the search a little bit
ced659bb 894 found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
b30b8698
PH
895 if not found:
896 # Broaden the findall a little bit: JWPlayer JS loader
ced659bb
S
897 found = filter_video(re.findall(
898 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
4d805e06
PH
899 if not found:
900 # Flow player
ced659bb 901 found = filter_video(re.findall(r'''(?xs)
4d805e06
PH
902 flowplayer\("[^"]+",\s*
903 \{[^}]+?\}\s*,
904 \s*{[^}]+? ["']?clip["']?\s*:\s*\{\s*
905 ["']?url["']?\s*:\s*["']([^"']+)["']
ced659bb 906 ''', webpage))
b30b8698 907 if not found:
9b122384 908 # Try to find twitter cards info
ced659bb
S
909 found = filter_video(re.findall(
910 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
b30b8698 911 if not found:
9b122384
PH
912 # We look for Open Graph info:
913 # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
b30b8698 914 m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
9b122384
PH
915 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
916 if m_video_type is not None:
ced659bb 917 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
b30b8698 918 if not found:
7fea7156 919 # HTML5 video
aff216ed 920 found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]+)? src="([^"]+)"', webpage)
b30b8698 921 if not found:
a5a45015 922 found = re.search(
89ef304b 923 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
a04aa7a9 924 r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'?([^\'"]+)',
89ef304b 925 webpage)
b30b8698
PH
926 if found:
927 new_url = found.group(1)
89ef304b
PH
928 self.report_following_redirect(new_url)
929 return {
930 '_type': 'url',
931 'url': new_url,
932 }
b30b8698 933 if not found:
79649588 934 raise ExtractorError('Unsupported URL: %s' % url)
9b122384 935
b30b8698
PH
936 entries = []
937 for video_url in found:
938 video_url = compat_urlparse.urljoin(url, video_url)
939 video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
9b122384 940
b30b8698
PH
941 # Sometimes, jwplayer extraction will result in a YouTube URL
942 if YoutubeIE.suitable(video_url):
943 entries.append(self.url_result(video_url, 'Youtube'))
944 continue
9b122384 945
b30b8698
PH
946 # here's a fun little line of code for you:
947 video_id = os.path.splitext(video_id)[0]
fc9713a1 948
b30b8698
PH
949 entries.append({
950 'id': video_id,
951 'url': video_url,
952 'uploader': video_uploader,
953 'title': video_title,
4d805e06 954 'age_limit': age_limit,
b30b8698
PH
955 })
956
957 if len(entries) == 1:
669f0e7c 958 return entries[0]
b30b8698
PH
959 else:
960 for num, e in enumerate(entries, start=1):
961 e['title'] = '%s (%d)' % (e['title'], num)
962 return {
963 '_type': 'playlist',
964 'entries': entries,
965 }
9b122384 966