]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/generic.py
[gorillavid] Fix test title
[yt-dlp.git] / youtube_dl / extractor / generic.py
CommitLineData
cfe50f04
JMF
1# encoding: utf-8
2
79649588
PH
3from __future__ import unicode_literals
4
9b122384
PH
5import os
6import re
7
8from .common import InfoExtractor
fc9713a1 9from .youtube import YoutubeIE
9b122384 10from ..utils import (
9b122384 11 compat_urllib_parse,
a5caba1e 12 compat_urlparse,
f7300c5c 13 compat_xml_parse_error,
9b122384 14
b759a0d4 15 determine_ext,
9b122384 16 ExtractorError,
c8e9a235 17 float_or_none,
aa94a6d3 18 HEADRequest,
ed2d6a19 19 orderedSet,
bcf89ce6 20 parse_xml,
9d4660ca
PH
21 smuggle_url,
22 unescapeHTML,
42393ce2 23 unified_strdate,
4d54ef20 24 unsmuggle_url,
42393ce2 25 url_basename,
9b122384 26)
cfe50f04 27from .brightcove import BrightcoveIE
c0d0b01f 28from .ooyala import OoyalaIE
93d020dd 29from .rutv import RUTVIE
cb3ac1c6 30from .smotri import SmotriIE
1419fafd 31from .condenast import CondeNastIE
9b122384 32
0838239e 33
9b122384 34class GenericIE(InfoExtractor):
79649588 35 IE_DESC = 'Generic downloader that works on some sites'
9b122384 36 _VALID_URL = r'.*'
79649588 37 IE_NAME = 'generic'
cfe50f04
JMF
38 _TESTS = [
39 {
79649588 40 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
d360a146 41 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
79649588 42 'info_dict': {
d360a146
S
43 'id': '13601338388002',
44 'ext': 'mp4',
79649588
PH
45 'uploader': 'www.hodiho.fr',
46 'title': 'R\u00e9gis plante sa Jeep',
cfe50f04
JMF
47 }
48 },
c19f7764
JMF
49 # bandcamp page with custom domain
50 {
79649588
PH
51 'add_ie': ['Bandcamp'],
52 'url': 'http://bronyrock.com/track/the-pony-mash',
79649588 53 'info_dict': {
fd50bf62
S
54 'id': '3235767654',
55 'ext': 'mp3',
79649588
PH
56 'title': 'The Pony Mash',
57 'uploader': 'M_Pallante',
c19f7764 58 },
79649588 59 'skip': 'There is a limit of 200 free downloads / month for the test song',
c19f7764 60 },
eeb165e6 61 # embedded brightcove video
dd5bcdc4
JMF
62 # it also tests brightcove videos that need to set the 'Referer' in the
63 # http requests
eeb165e6 64 {
79649588
PH
65 'add_ie': ['Brightcove'],
66 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
67 'info_dict': {
68 'id': '2765128793001',
69 'ext': 'mp4',
70 'title': 'Le cours de bourse : l’analyse technique',
71 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
72 'uploader': 'BFM BUSINESS',
eeb165e6 73 },
79649588
PH
74 'params': {
75 'skip_download': True,
eeb165e6
JMF
76 },
77 },
17ab4d3b
PH
78 {
79 # https://github.com/rg3/youtube-dl/issues/2253
80 'url': 'http://bcove.me/i6nfkrc3',
17ab4d3b
PH
81 'md5': '0ba9446db037002366bab3b3eb30c88c',
82 'info_dict': {
fd50bf62
S
83 'id': '3101154703001',
84 'ext': 'mp4',
17ab4d3b
PH
85 'title': 'Still no power',
86 'uploader': 'thestar.com',
87 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
88 },
89 'add_ie': ['Brightcove'],
90 },
0479c625
S
91 {
92 'url': 'http://www.championat.com/video/football/v/87/87499.html',
93 'md5': 'fb973ecf6e4a78a67453647444222983',
94 'info_dict': {
95 'id': '3414141473001',
96 'ext': 'mp4',
97 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
98 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
99 'uploader': 'Championat',
100 },
101 },
42393ce2
PH
102 # Direct link to a video
103 {
79649588 104 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
79649588
PH
105 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
106 'info_dict': {
107 'id': 'trailer',
89ef304b 108 'ext': 'mp4',
79649588
PH
109 'title': 'trailer',
110 'upload_date': '20100513',
42393ce2 111 }
c0d0b01f
JMF
112 },
113 # ooyala video
114 {
79649588
PH
115 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
116 'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
117 'info_dict': {
118 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
119 'ext': 'mp4',
3486df38 120 'title': '2cc213299525360.mov', # that's what we get
c0d0b01f
JMF
121 },
122 },
89ef304b
PH
123 # google redirect
124 {
125 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
126 'info_dict': {
127 'id': 'cmQHVoWB5FY',
128 'ext': 'mp4',
129 'upload_date': '20130224',
130 'uploader_id': 'TheVerge',
131 'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.',
132 'uploader': 'The Verge',
133 'title': 'First Firefox OS phones side-by-side',
134 },
135 'params': {
136 'skip_download': False,
137 }
f55a1f0a 138 },
1b86cc41 139 # embed.ly video
140 {
141 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
142 'info_dict': {
143 'id': '9ODmcdjQcHQ',
144 'ext': 'mp4',
0a5bce56
PH
145 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
146 'upload_date': '20140225',
147 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
148 'uploader': 'Tested',
149 'uploader_id': 'testedcom',
1b86cc41 150 },
151 # No need to test YoutubeIE here
152 'params': {
153 'skip_download': True,
154 },
155 },
60cc4dc4
PH
156 # funnyordie embed
157 {
158 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
60cc4dc4
PH
159 'info_dict': {
160 'id': '18e820ec3f',
161 'ext': 'mp4',
162 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
163 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
93d020dd 164 },
60cc4dc4 165 },
93d020dd
S
166 # RUTV embed
167 {
168 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
169 'info_dict': {
170 'id': '776940',
171 'ext': 'mp4',
172 'title': 'Охотское море стало целиком российским',
173 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
174 },
175 'params': {
176 # m3u8 download
177 'skip_download': True,
178 },
aab74fa1
PH
179 },
180 # Embedded TED video
181 {
182 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
a8eb5a8e 183 'md5': '65fdff94098e4a607385a60c5177c638',
aab74fa1 184 'info_dict': {
a8eb5a8e 185 'id': '1969',
aab74fa1 186 'ext': 'mp4',
a8eb5a8e
PH
187 'title': 'Hidden miracles of the natural world',
188 'uploader': 'Louie Schwartzberg',
189 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
aab74fa1 190 }
60cc4dc4 191 },
5c386252 192 # Embeded Ustream video
193 {
194 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
195 'md5': '27b99cdb639c9b12a79bca876a073417',
196 'info_dict': {
ca6aada4 197 'id': '45734260',
198 'ext': 'flv',
199 'uploader': 'AU SPA: The NSA and Privacy',
5c386252 200 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
201 }
202 },
d95e35d6
S
203 # nowvideo embed hidden behind percent encoding
204 {
205 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
206 'md5': '2baf4ddd70f697d94b1c18cf796d5107',
207 'info_dict': {
208 'id': '06e53103ca9aa',
209 'ext': 'flv',
210 'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
211 'description': 'No description',
212 },
0f2a2ba1 213 },
893f8832
PH
214 # arte embed
215 {
216 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
217 'md5': '7653032cbb25bf6c80d80f217055fa43',
218 'info_dict': {
219 'id': '048195-004_PLUS7-F',
220 'ext': 'flv',
221 'title': 'X:enius',
222 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
223 'upload_date': '20140320',
224 },
225 'params': {
226 'skip_download': 'Requires rtmpdump'
227 }
228 },
fa35cdad
PH
229 # Condé Nast embed
230 {
231 'url': 'http://www.wired.com/2014/04/honda-asimo/',
232 'md5': 'ba0dfe966fa007657bd1443ee672db0f',
233 'info_dict': {
234 'id': '53501be369702d3275860000',
235 'ext': 'mp4',
236 'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
237 }
ebd3c7b3
PH
238 },
239 # Dailymotion embed
240 {
241 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
242 'md5': '441aeeb82eb72c422c7f14ec533999cd',
243 'info_dict': {
244 'id': 'k2mm4bCdJ6CQ2i7c8o2',
245 'ext': 'mp4',
246 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
247 'uploader': 'Spi0n',
248 },
249 'add_ie': ['Dailymotion'],
2b88feed
PH
250 },
251 # YouTube embed
252 {
253 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
254 'info_dict': {
255 'id': 'FXRb4ykk4S0',
256 'ext': 'mp4',
257 'title': 'The NBL Auction 2014',
258 'uploader': 'BADMINTON England',
259 'uploader_id': 'BADMINTONEvents',
260 'upload_date': '20140603',
261 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
262 },
263 'add_ie': ['Youtube'],
264 'params': {
265 'skip_download': True,
266 }
267 },
c5cd249e
JMF
268 # MTVSercices embed
269 {
270 'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
271 'md5': '35727f82f58c76d996fc188f9755b0d5',
272 'info_dict': {
273 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
274 'ext': 'mp4',
275 'title': 'Review',
276 'description': 'Mario\'s life in the fast lane has never looked so good.',
277 },
278 },
61013473 279 # YouTube embed via <data-embed-url="">
280 {
281 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
61013473 282 'info_dict': {
a8eb5a8e 283 'id': '4vAffPZIT44',
61013473 284 'ext': 'mp4',
a8eb5a8e 285 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
ed2d6a19
PH
286 'uploader': 'Gameloft',
287 'uploader_id': 'gameloft',
a8eb5a8e
PH
288 'upload_date': '20140828',
289 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
ed2d6a19
PH
290 },
291 'params': {
292 'skip_download': True,
61013473 293 }
c8e9a235
PH
294 },
295 # Camtasia studio
296 {
297 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
298 'playlist': [{
299 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
300 'info_dict': {
301 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
302 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
303 'ext': 'flv',
304 'duration': 2235.90,
305 }
306 }, {
307 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
308 'info_dict': {
309 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
310 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
311 'ext': 'flv',
312 'duration': 2235.93,
313 }
314 }],
315 'info_dict': {
316 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
317 }
4d805e06
PH
318 },
319 # Flowplayer
320 {
321 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
322 'md5': '9d65602bf31c6e20014319c7d07fba27',
323 'info_dict': {
324 'id': '5123ea6d5e5a7',
325 'ext': 'mp4',
326 'age_limit': 18,
327 'uploader': 'www.handjobhub.com',
328 'title': 'Busty Blonde Siri Tit Fuck While Wank at Handjob Hub',
329 }
0990305d
PH
330 },
331 # RSS feed
332 {
333 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
334 'info_dict': {
335 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
336 'title': 'Zero Punctuation',
337 'description': 're:'
338 },
339 'playlist_mincount': 11,
22a6f150
PH
340 },
341 # Multiple brightcove videos
342 # https://github.com/rg3/youtube-dl/issues/2283
343 {
344 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
345 'info_dict': {
346 'id': 'always-never',
347 'title': 'Always / Never - The New Yorker',
348 },
349 'playlist_count': 3,
350 'params': {
351 'extract_flat': False,
352 'skip_download': True,
353 }
1a94ff68
S
354 },
355 # MLB embed
356 {
357 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
358 'md5': '96f09a37e44da40dd083e12d9a683327',
359 'info_dict': {
360 'id': '33322633',
361 'ext': 'mp4',
362 'title': 'Ump changes call to ball',
363 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
364 'duration': 48,
365 'timestamp': 1401537900,
366 'upload_date': '20140531',
367 'thumbnail': 're:^https?://.*\.jpg$',
368 },
369 },
746c67d7
NJ
370 # Wistia embed
371 {
372 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
373 'md5': '8788b683c777a5cf25621eaf286d0c23',
374 'info_dict': {
375 'id': '1cfaf6b7ea',
376 'ext': 'mov',
377 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
378 'duration': 643.0,
379 'filesize': 182808282,
380 'uploader': 'education-portal.com',
381 },
382 },
52cffcb1 383 {
384 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
385 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
386 'info_dict': {
387 'id': 'uxjb0lwrcz',
388 'ext': 'mp4',
85d7b765 389 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
52cffcb1 390 'duration': 1715.0,
85d7b765 391 'uploader': 'thoughtworks.wistia.com',
70b7e3fb 392 },
52cffcb1 393 },
70b7e3fb
PH
394 # Direct download with broken HEAD
395 {
396 'url': 'http://ai-radio.org:8000/radio.opus',
397 'info_dict': {
398 'id': 'radio',
399 'ext': 'opus',
400 'title': 'radio',
401 },
402 'params': {
403 'skip_download': True, # infinite live stream
404 },
405 'expected_warnings': [
406 r'501.*Not Implemented'
407 ],
408 }
cfe50f04 409 ]
9b122384 410
9b122384
PH
def report_following_redirect(self, new_url):
    """Print a '[redirect]' notice naming the URL that is followed next."""
    notice = '[redirect] Following redirect to %s' % new_url
    self._downloader.to_screen(notice)
9b122384 414
4fc946b5
PH
415 def _extract_rss(self, url, video_id, doc):
416 playlist_title = doc.find('./channel/title').text
417 playlist_desc_el = doc.find('./channel/description')
418 playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
419
420 entries = [{
421 '_type': 'url',
422 'url': e.find('link').text,
423 'title': e.find('title').text,
424 } for e in doc.findall('./channel/item')]
425
426 return {
427 '_type': 'playlist',
428 'id': url,
429 'title': playlist_title,
430 'description': playlist_desc,
431 'entries': entries,
432 }
433
c8e9a235
PH
def _extract_camtasia(self, url, video_id, webpage):
    """Extract the videos of a Camtasia Studio project page.

    The page is recognised by its ``fo.addVariable("csConfigFile", ...)``
    call; the referenced XML configuration is downloaded and every child of
    ``./playlist/array/fileset`` that has a ``<uri>`` becomes one entry.

    Returns a playlist dict, or None if no camtasia video can be found
    (no configuration reference in the page, or no fileset in the
    configuration).
    """
    config_ref = self._search_regex(
        r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
        webpage, 'camtasia configuration file', default=None)
    if config_ref is None:
        return None

    title = self._html_search_meta('DC.title', webpage, fatal=True)

    camtasia_url = compat_urlparse.urljoin(url, config_ref)
    camtasia_cfg = self._download_xml(
        camtasia_url, video_id,
        note='Downloading camtasia configuration',
        errnote='Failed to download camtasia configuration')
    fileset_node = camtasia_cfg.find('./playlist/array/fileset')
    if fileset_node is None:
        return None

    entries = []
    # Iterate the element directly: Element.getchildren() no longer exists
    # in Python 3.9+.
    for n in fileset_node:
        url_n = n.find('./uri')
        if url_n is None:
            continue

        duration_n = n.find('./duration')
        entries.append({
            'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
            'title': '%s - %s' % (title, n.tag),
            'url': compat_urlparse.urljoin(url, url_n.text),
            'duration': (
                None if duration_n is None
                else float_or_none(duration_n.text)),
        })

    return {
        '_type': 'playlist',
        'entries': entries,
        'title': title,
    }
470
def _real_extract(self, url):
    """Extract media from an arbitrary URL by probing for known patterns.

    The checks run in this order and the first hit wins:

    1. protocol-relative URLs and scheme-less input (handled according to
       the ``default_search`` parameter);
    2. a HEAD request to follow redirects and to detect direct links to
       audio/video files via the Content-Type header;
    3. RSS feeds and Camtasia projects;
    4. embedded players of known sites (Brightcove, Vimeo, YouTube, ...);
    5. generic player patterns (JW Player, Flowplayer, twitter cards,
       Open Graph, HTML5 <video>) and finally a meta refresh redirect.

    Returns an info dict, a ``url``/``url_transparent`` result or a
    playlist. Raises ExtractorError when the input is not a valid URL and
    no search fallback applies, or when nothing usable is found.
    """
    if url.startswith('//'):
        return {
            '_type': 'url',
            'url': self.http_scheme() + url,
        }

    parsed_url = compat_urlparse.urlparse(url)
    if not parsed_url.scheme:
        default_search = self._downloader.params.get('default_search')
        if default_search is None:
            default_search = 'fixup_error'

        if default_search in ('auto', 'auto_warning', 'fixup_error'):
            if '/' in url:
                self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
                return self.url_result('http://' + url)
            elif default_search != 'fixup_error':
                if default_search == 'auto_warning':
                    if re.match(r'^(?:url|URL)$', url):
                        raise ExtractorError(
                            'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
                            expected=True)
                    else:
                        self._downloader.report_warning(
                            'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
                return self.url_result('ytsearch:' + url)

        if default_search in ('error', 'fixup_error'):
            raise ExtractorError(
                ('%r is not a valid URL. '
                 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
                 ) % (url, url), expected=True)
        else:
            if ':' not in default_search:
                default_search += ':'
            return self.url_result(default_search + url)

    url, smuggled_data = unsmuggle_url(url)
    force_videoid = None
    is_intentional = smuggled_data and smuggled_data.get('to_generic')
    if smuggled_data and 'force_videoid' in smuggled_data:
        force_videoid = smuggled_data['force_videoid']
        video_id = force_videoid
    else:
        video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]

    self.to_screen('%s: Requesting header' % video_id)

    head_req = HEADRequest(url)
    head_response = self._request_webpage(
        head_req, video_id,
        note=False, errnote='Could not send HEAD request to %s' % url,
        fatal=False)

    if head_response is not False:
        # Check for redirect
        new_url = head_response.geturl()
        if url != new_url:
            self.report_following_redirect(new_url)
            if force_videoid:
                # Keep the forced id across the redirect
                new_url = smuggle_url(
                    new_url, {'force_videoid': force_videoid})
            return self.url_result(new_url)

    # The HEAD request failed: fall back to a full request and reuse its
    # response both for the header check and for the page content below.
    full_response = None
    if head_response is False:
        full_response = self._request_webpage(url, video_id)
        head_response = full_response

    # Check for direct link to a video
    content_type = head_response.headers.get('Content-Type', '')
    m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
    if m:
        upload_date = unified_strdate(
            head_response.headers.get('Last-Modified'))
        return {
            'id': video_id,
            'title': os.path.splitext(url_basename(url))[0],
            'formats': [{
                'format_id': m.group('format_id'),
                'url': url,
                'vcodec': 'none' if m.group('type') == 'audio' else None
            }],
            'upload_date': upload_date,
        }

    if not self._downloader.params.get('test', False) and not is_intentional:
        self._downloader.report_warning('Falling back on generic information extractor.')

    if full_response:
        webpage = self._webpage_read_content(full_response, url, video_id)
    else:
        webpage = self._download_webpage(url, video_id)
    self.report_extraction(video_id)

    # Is it an RSS feed?
    try:
        doc = parse_xml(webpage)
        if doc.tag == 'rss':
            return self._extract_rss(url, video_id, doc)
    except compat_xml_parse_error:
        # Not well-formed XML, so not a feed; carry on with HTML heuristics
        pass

    # Is it a Camtasia project?
    camtasia_res = self._extract_camtasia(url, video_id, webpage)
    if camtasia_res is not None:
        return camtasia_res

    # Sometimes embedded video player is hidden behind percent encoding
    # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
    # Unescaping the whole page allows to handle those cases in a generic way
    webpage = compat_urllib_parse.unquote(webpage)

    # it's tempting to parse this further, but you would
    # have to take into account all the variations like
    #   Video Title - Site Name
    #   Site Name | Video Title
    #   Video Title - Tagline | Site Name
    # and so on and so forth; it's just not practical
    video_title = self._html_search_regex(
        r'(?s)<title>(.*?)</title>', webpage, 'video title',
        default='video')

    # Try to detect age limit automatically
    age_limit = self._rta_search(webpage)
    # And then there are the jokers who advertise that they use RTA,
    # but actually don't.
    AGE_LIMIT_MARKERS = [
        r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
    ]
    if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
        age_limit = 18

    # video uploader is domain name
    video_uploader = self._search_regex(
        r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')

    # Helper method
    def _playlist_from_matches(matches, getter, ie=None):
        # Build a de-duplicated playlist of url results; getter(m) turns
        # one regex match into the embed URL.
        urlrs = orderedSet(
            self.url_result(self._proto_relative_url(getter(m)), ie)
            for m in matches)
        return self.playlist_result(
            urlrs, playlist_id=video_id, playlist_title=video_title)

    # Look for BrightCove:
    bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
    if bc_urls:
        self.to_screen('Brightcove video detected.')
        entries = [{
            '_type': 'url',
            'url': smuggle_url(bc_url, {'Referer': url}),
            'ie_key': 'Brightcove'
        } for bc_url in bc_urls]

        return {
            '_type': 'playlist',
            'title': video_title,
            'id': video_id,
            'entries': entries,
        }

    # Look for embedded (iframe) Vimeo player
    mobj = re.search(
        r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
    if mobj:
        player_url = unescapeHTML(mobj.group('url'))
        surl = smuggle_url(player_url, {'Referer': url})
        return self.url_result(surl)

    # Look for embedded (swf embed) Vimeo player
    mobj = re.search(
        r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
    if mobj:
        return self.url_result(mobj.group(1))

    # Look for embedded YouTube player
    # (the embedSWF alternative used to read "embedSWF\(?:\s*", a mangled
    # group that could never match an actual embedSWF("...") call)
    matches = re.findall(r'''(?x)
        (?:
            <iframe[^>]+?src=|
            data-video-url=|
            <embed[^>]+?src=|
            embedSWF\(\s*|
            new\s+SWFObject\(
        )
        (["\'])
            (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
            (?:embed|v|p)/.+?)
        \1''', webpage)
    if matches:
        return _playlist_from_matches(
            matches, lambda m: unescapeHTML(m[1]))

    # Look for embedded Dailymotion player
    matches = re.findall(
        r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
    if matches:
        return _playlist_from_matches(
            matches, lambda m: unescapeHTML(m[1]))

    # Look for embedded Dailymotion playlist player (#3822)
    m = re.search(
        r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
    if m:
        playlists = re.findall(
            r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
        if playlists:
            return _playlist_from_matches(
                playlists, lambda p: '//dailymotion.com/playlist/%s' % p)

    # Look for embedded Wistia player
    match = re.search(
        r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
    if match:
        embed_url = self._proto_relative_url(
            unescapeHTML(match.group('url')))
        return {
            '_type': 'url_transparent',
            'url': embed_url,
            'ie_key': 'Wistia',
            'uploader': video_uploader,
            'title': video_title,
            'id': video_id,
        }

    match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
    if match:
        return {
            '_type': 'url_transparent',
            'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
            'ie_key': 'Wistia',
            'uploader': video_uploader,
            'title': video_title,
            'id': match.group('id')
        }

    # Look for embedded blip.tv player
    mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
    if mobj:
        return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
    mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
    if mobj:
        return self.url_result(mobj.group(1), 'BlipTV')

    # Look for embedded condenast player
    # (the closing quote belongs outside the capture group, otherwise it
    # ends up as part of the extracted URL)
    matches = re.findall(
        r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+)"',
        webpage)
    if matches:
        return {
            '_type': 'playlist',
            'entries': [{
                '_type': 'url',
                'ie_key': 'CondeNast',
                'url': ma,
            } for ma in matches],
            'title': video_title,
            'id': video_id,
        }

    # Look for Bandcamp pages with custom domain
    mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
    if mobj is not None:
        burl = unescapeHTML(mobj.group(1))
        # Don't set the extractor because it can be a track url or an album
        return self.url_result(burl)

    # Look for embedded Vevo player
    mobj = re.search(
        r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
    if mobj is not None:
        return self.url_result(mobj.group('url'))

    # Look for Ooyala videos
    mobj = (re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
            re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage))
    if mobj is not None:
        return OoyalaIE._build_url_result(mobj.group('ec'))

    # Look for Aparat videos
    mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
    if mobj is not None:
        return self.url_result(mobj.group(1), 'Aparat')

    # Look for MPORA videos
    mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
    if mobj is not None:
        return self.url_result(mobj.group(1), 'Mpora')

    # Look for embedded NovaMov-based player
    mobj = re.search(
        r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
                (?P<url>http://(?:(?:embed|www)\.)?
                    (?:novamov\.com|
                       nowvideo\.(?:ch|sx|eu|at|ag|co)|
                       videoweed\.(?:es|com)|
                       movshare\.(?:net|sx|ag)|
                       divxstage\.(?:eu|net|ch|co|at|ag))
                    /embed\.php.+?)\1''', webpage)
    if mobj is not None:
        return self.url_result(mobj.group('url'))

    # Look for embedded Facebook player
    mobj = re.search(
        r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
    if mobj is not None:
        return self.url_result(mobj.group('url'), 'Facebook')

    # Look for embedded VK player
    mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
    if mobj is not None:
        return self.url_result(mobj.group('url'), 'VK')

    # Look for embedded ivi player
    mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
    if mobj is not None:
        return self.url_result(mobj.group('url'), 'Ivi')

    # Look for embedded Huffington Post player
    mobj = re.search(
        r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
    if mobj is not None:
        return self.url_result(mobj.group('url'), 'HuffPost')

    # Look for embed.ly
    mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
    if mobj is not None:
        return self.url_result(mobj.group('url'))
    mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
    if mobj is not None:
        return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))

    # Look for funnyordie embed
    matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
    if matches:
        return _playlist_from_matches(
            matches, getter=unescapeHTML, ie='FunnyOrDie')

    # Look for embedded RUTV player
    rutv_url = RUTVIE._extract_url(webpage)
    if rutv_url:
        return self.url_result(rutv_url, 'RUTV')

    # Look for embedded TED player
    mobj = re.search(
        r'<iframe[^>]+?src=(["\'])(?P<url>http://embed\.ted\.com/.+?)\1', webpage)
    if mobj is not None:
        return self.url_result(mobj.group('url'), 'TED')

    # Look for embedded Ustream videos
    mobj = re.search(
        r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
    if mobj is not None:
        return self.url_result(mobj.group('url'), 'Ustream')

    # Look for embedded arte.tv player
    mobj = re.search(
        r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
        webpage)
    if mobj is not None:
        return self.url_result(mobj.group('url'), 'ArteTVEmbed')

    # Look for embedded smotri.com player
    smotri_url = SmotriIE._extract_url(webpage)
    if smotri_url:
        return self.url_result(smotri_url, 'Smotri')

    # Look for embedded soundcloud player
    mobj = re.search(
        r'<iframe src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
        webpage)
    if mobj is not None:
        url = unescapeHTML(mobj.group('url'))
        return self.url_result(url)

    # Look for embedded vulture.com player
    mobj = re.search(
        r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
        webpage)
    if mobj is not None:
        url = unescapeHTML(mobj.group('url'))
        return self.url_result(url, ie='Vulture')

    # Look for embedded mtvservices player
    mobj = re.search(
        r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
        webpage)
    if mobj is not None:
        url = unescapeHTML(mobj.group('url'))
        return self.url_result(url, ie='MTVServicesEmbedded')

    # Look for embedded yahoo player
    mobj = re.search(
        r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
        webpage)
    if mobj is not None:
        return self.url_result(mobj.group('url'), 'Yahoo')

    # Look for embedded sbs.com.au player
    mobj = re.search(
        r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)sbs\.com\.au/ondemand/video/single/.+?)\1',
        webpage)
    if mobj is not None:
        return self.url_result(mobj.group('url'), 'SBS')

    # Look for embedded MLB player
    mobj = re.search(
        r'<iframe[^>]+?src=(["\'])(?P<url>https?://m\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
        webpage)
    if mobj is not None:
        return self.url_result(mobj.group('url'), 'MLB')

    # Look for a Condé Nast embed matching the extractor's own pattern
    mobj = re.search(
        r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
        webpage)
    if mobj is not None:
        return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')

    def check_video(vurl):
        # Accept only paths that have an extension which is not a known
        # player, image or subtitle format.
        vpath = compat_urlparse.urlparse(vurl).path
        vext = determine_ext(vpath)
        return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')

    def filter_video(urls):
        return list(filter(check_video, urls))

    # Start with something easy: JW Player in SWFObject
    found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
    if not found:
        # Look for gorilla-vid style embedding
        found = filter_video(re.findall(r'''(?sx)
            (?:
                jw_plugins|
                JWPlayerOptions|
                jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
            )
            .*?file\s*:\s*["\'](.*?)["\']''', webpage))
    if not found:
        # Broaden the search a little bit
        found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
    if not found:
        # Broaden the findall a little bit: JWPlayer JS loader
        found = filter_video(re.findall(
            r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
    if not found:
        # Flow player
        found = filter_video(re.findall(r'''(?xs)
            flowplayer\("[^"]+",\s*
                \{[^}]+?\}\s*,
                \s*{[^}]+? ["']?clip["']?\s*:\s*\{\s*
                    ["']?url["']?\s*:\s*["']([^"']+)["']
        ''', webpage))
    if not found:
        # Try to find twitter cards info
        found = filter_video(re.findall(
            r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
    if not found:
        # We look for Open Graph info:
        # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
        # NOTE: this lookup used to sit behind an og:video:type check that
        # compared the re.findall() result with None. findall always
        # returns a list, so the check was always true and og:video has
        # always been consulted unconditionally; that behaviour is kept.
        # Flash players are rejected by filter_video() instead.
        found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
    if not found:
        # HTML5 video
        found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]+)? src="([^"]+)"', webpage)
    if not found:
        # Last resort: follow a <meta http-equiv="refresh"> redirect
        refresh_mobj = re.search(
            r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
            r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'?([^\'"]+)',
            webpage)
        if refresh_mobj:
            new_url = refresh_mobj.group(1)
            self.report_following_redirect(new_url)
            return {
                '_type': 'url',
                'url': new_url,
            }
        raise ExtractorError('Unsupported URL: %s' % url)

    entries = []
    for video_url in found:
        video_url = compat_urlparse.urljoin(url, video_url)
        video_id = compat_urllib_parse.unquote(os.path.basename(video_url))

        # Sometimes, jwplayer extraction will result in a YouTube URL
        if YoutubeIE.suitable(video_url):
            entries.append(self.url_result(video_url, 'Youtube'))
            continue

        # here's a fun little line of code for you:
        video_id = os.path.splitext(video_id)[0]

        entries.append({
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'title': video_title,
            'age_limit': age_limit,
        })

    if len(entries) == 1:
        return entries[0]
    else:
        # Several candidates: number the titles so they stay distinguishable
        for num, e in enumerate(entries, start=1):
            e['title'] = '%s (%d)' % (e['title'], num)
        return {
            '_type': 'playlist',
            'entries': entries,
        }
9b122384 982