]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/generic.py
release 2014.11.25.1
[yt-dlp.git] / youtube_dl / extractor / generic.py
CommitLineData
cfe50f04
JMF
1# encoding: utf-8
2
79649588
PH
3from __future__ import unicode_literals
4
9b122384
PH
5import os
6import re
7
8from .common import InfoExtractor
fc9713a1 9from .youtube import YoutubeIE
8c25f81b 10from ..compat import (
9b122384 11 compat_urllib_parse,
a5caba1e 12 compat_urlparse,
f7300c5c 13 compat_xml_parse_error,
8c25f81b
PH
14)
15from ..utils import (
b759a0d4 16 determine_ext,
9b122384 17 ExtractorError,
c8e9a235 18 float_or_none,
aa94a6d3 19 HEADRequest,
ed2d6a19 20 orderedSet,
bcf89ce6 21 parse_xml,
9d4660ca
PH
22 smuggle_url,
23 unescapeHTML,
42393ce2 24 unified_strdate,
4d54ef20 25 unsmuggle_url,
42393ce2 26 url_basename,
9b122384 27)
cfe50f04 28from .brightcove import BrightcoveIE
c0d0b01f 29from .ooyala import OoyalaIE
93d020dd 30from .rutv import RUTVIE
cb3ac1c6 31from .smotri import SmotriIE
1419fafd 32from .condenast import CondeNastIE
9b122384 33
0838239e 34
9b122384 35class GenericIE(InfoExtractor):
79649588 36 IE_DESC = 'Generic downloader that works on some sites'
9b122384 37 _VALID_URL = r'.*'
79649588 38 IE_NAME = 'generic'
cfe50f04
JMF
39 _TESTS = [
40 {
79649588 41 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
d360a146 42 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
79649588 43 'info_dict': {
d360a146
S
44 'id': '13601338388002',
45 'ext': 'mp4',
79649588
PH
46 'uploader': 'www.hodiho.fr',
47 'title': 'R\u00e9gis plante sa Jeep',
cfe50f04
JMF
48 }
49 },
c19f7764
JMF
50 # bandcamp page with custom domain
51 {
79649588
PH
52 'add_ie': ['Bandcamp'],
53 'url': 'http://bronyrock.com/track/the-pony-mash',
79649588 54 'info_dict': {
fd50bf62
S
55 'id': '3235767654',
56 'ext': 'mp3',
79649588
PH
57 'title': 'The Pony Mash',
58 'uploader': 'M_Pallante',
c19f7764 59 },
79649588 60 'skip': 'There is a limit of 200 free downloads / month for the test song',
c19f7764 61 },
eeb165e6 62 # embedded brightcove video
dd5bcdc4
JMF
63 # it also tests brightcove videos that need to set the 'Referer' in the
64 # http requests
eeb165e6 65 {
79649588
PH
66 'add_ie': ['Brightcove'],
67 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
68 'info_dict': {
69 'id': '2765128793001',
70 'ext': 'mp4',
71 'title': 'Le cours de bourse : l’analyse technique',
72 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
73 'uploader': 'BFM BUSINESS',
eeb165e6 74 },
79649588
PH
75 'params': {
76 'skip_download': True,
eeb165e6
JMF
77 },
78 },
17ab4d3b
PH
79 {
80 # https://github.com/rg3/youtube-dl/issues/2253
81 'url': 'http://bcove.me/i6nfkrc3',
17ab4d3b
PH
82 'md5': '0ba9446db037002366bab3b3eb30c88c',
83 'info_dict': {
fd50bf62
S
84 'id': '3101154703001',
85 'ext': 'mp4',
17ab4d3b
PH
86 'title': 'Still no power',
87 'uploader': 'thestar.com',
88 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
89 },
90 'add_ie': ['Brightcove'],
91 },
0479c625
S
92 {
93 'url': 'http://www.championat.com/video/football/v/87/87499.html',
94 'md5': 'fb973ecf6e4a78a67453647444222983',
95 'info_dict': {
96 'id': '3414141473001',
97 'ext': 'mp4',
98 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
99 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
100 'uploader': 'Championat',
101 },
102 },
bdf97017 103 {
37aab278 104 # https://github.com/rg3/youtube-dl/issues/3541
bdf97017
NJ
105 'add_ie': ['Brightcove'],
106 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
107 'info_dict': {
108 'id': '3866516442001',
37aab278 109 'ext': 'mp4',
bdf97017
NJ
110 'title': 'Leer mij vrouwen kennen: Aflevering 1',
111 'description': 'Leer mij vrouwen kennen: Aflevering 1',
112 'uploader': 'SBS Broadcasting',
113 },
37aab278 114 'skip': 'Restricted to Netherlands',
bdf97017 115 'params': {
37aab278 116 'skip_download': True, # m3u8 download
bdf97017
NJ
117 },
118 },
42393ce2
PH
119 # Direct link to a video
120 {
79649588 121 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
79649588
PH
122 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
123 'info_dict': {
124 'id': 'trailer',
89ef304b 125 'ext': 'mp4',
79649588
PH
126 'title': 'trailer',
127 'upload_date': '20100513',
42393ce2 128 }
c0d0b01f
JMF
129 },
130 # ooyala video
131 {
79649588
PH
132 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
133 'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
134 'info_dict': {
135 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
136 'ext': 'mp4',
3486df38 137 'title': '2cc213299525360.mov', # that's what we get
c0d0b01f
JMF
138 },
139 },
89ef304b
PH
140 # google redirect
141 {
142 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
143 'info_dict': {
144 'id': 'cmQHVoWB5FY',
145 'ext': 'mp4',
146 'upload_date': '20130224',
147 'uploader_id': 'TheVerge',
148 'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.',
149 'uploader': 'The Verge',
150 'title': 'First Firefox OS phones side-by-side',
151 },
152 'params': {
153 'skip_download': False,
154 }
f55a1f0a 155 },
1b86cc41 156 # embed.ly video
157 {
158 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
159 'info_dict': {
160 'id': '9ODmcdjQcHQ',
161 'ext': 'mp4',
0a5bce56
PH
162 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
163 'upload_date': '20140225',
164 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
165 'uploader': 'Tested',
166 'uploader_id': 'testedcom',
1b86cc41 167 },
168 # No need to test YoutubeIE here
169 'params': {
170 'skip_download': True,
171 },
172 },
60cc4dc4
PH
173 # funnyordie embed
174 {
175 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
60cc4dc4
PH
176 'info_dict': {
177 'id': '18e820ec3f',
178 'ext': 'mp4',
179 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
180 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
93d020dd 181 },
60cc4dc4 182 },
93d020dd
S
183 # RUTV embed
184 {
185 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
186 'info_dict': {
187 'id': '776940',
188 'ext': 'mp4',
189 'title': 'Охотское море стало целиком российским',
190 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
191 },
192 'params': {
193 # m3u8 download
194 'skip_download': True,
195 },
aab74fa1
PH
196 },
197 # Embedded TED video
198 {
199 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
a8eb5a8e 200 'md5': '65fdff94098e4a607385a60c5177c638',
aab74fa1 201 'info_dict': {
a8eb5a8e 202 'id': '1969',
aab74fa1 203 'ext': 'mp4',
a8eb5a8e
PH
204 'title': 'Hidden miracles of the natural world',
205 'uploader': 'Louie Schwartzberg',
206 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
aab74fa1 207 }
60cc4dc4 208 },
5c386252 209 # Embedded Ustream video
210 {
211 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
212 'md5': '27b99cdb639c9b12a79bca876a073417',
213 'info_dict': {
ca6aada4 214 'id': '45734260',
215 'ext': 'flv',
216 'uploader': 'AU SPA: The NSA and Privacy',
5c386252 217 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
218 }
219 },
d95e35d6
S
220 # nowvideo embed hidden behind percent encoding
221 {
222 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
223 'md5': '2baf4ddd70f697d94b1c18cf796d5107',
224 'info_dict': {
225 'id': '06e53103ca9aa',
226 'ext': 'flv',
227 'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
228 'description': 'No description',
229 },
0f2a2ba1 230 },
893f8832
PH
231 # arte embed
232 {
233 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
234 'md5': '7653032cbb25bf6c80d80f217055fa43',
235 'info_dict': {
236 'id': '048195-004_PLUS7-F',
237 'ext': 'flv',
238 'title': 'X:enius',
239 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
240 'upload_date': '20140320',
241 },
242 'params': {
243 'skip_download': 'Requires rtmpdump'
244 }
245 },
fa35cdad
PH
246 # Condé Nast embed
247 {
248 'url': 'http://www.wired.com/2014/04/honda-asimo/',
249 'md5': 'ba0dfe966fa007657bd1443ee672db0f',
250 'info_dict': {
251 'id': '53501be369702d3275860000',
252 'ext': 'mp4',
253 'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
254 }
ebd3c7b3
PH
255 },
256 # Dailymotion embed
257 {
258 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
259 'md5': '441aeeb82eb72c422c7f14ec533999cd',
260 'info_dict': {
261 'id': 'k2mm4bCdJ6CQ2i7c8o2',
262 'ext': 'mp4',
263 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
264 'uploader': 'Spi0n',
265 },
266 'add_ie': ['Dailymotion'],
2b88feed
PH
267 },
268 # YouTube embed
269 {
270 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
271 'info_dict': {
272 'id': 'FXRb4ykk4S0',
273 'ext': 'mp4',
274 'title': 'The NBL Auction 2014',
275 'uploader': 'BADMINTON England',
276 'uploader_id': 'BADMINTONEvents',
277 'upload_date': '20140603',
278 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
279 },
280 'add_ie': ['Youtube'],
281 'params': {
282 'skip_download': True,
283 }
284 },
c5cd249e
JMF
285 # MTVServices embed
286 {
287 'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
288 'md5': '35727f82f58c76d996fc188f9755b0d5',
289 'info_dict': {
290 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
291 'ext': 'mp4',
292 'title': 'Review',
293 'description': 'Mario\'s life in the fast lane has never looked so good.',
294 },
295 },
61013473 296 # YouTube embed via <data-embed-url="">
297 {
298 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
61013473 299 'info_dict': {
a8eb5a8e 300 'id': '4vAffPZIT44',
61013473 301 'ext': 'mp4',
a8eb5a8e 302 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
ed2d6a19
PH
303 'uploader': 'Gameloft',
304 'uploader_id': 'gameloft',
a8eb5a8e
PH
305 'upload_date': '20140828',
306 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
ed2d6a19
PH
307 },
308 'params': {
309 'skip_download': True,
61013473 310 }
c8e9a235
PH
311 },
312 # Camtasia studio
313 {
314 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
315 'playlist': [{
316 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
317 'info_dict': {
318 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
319 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
320 'ext': 'flv',
321 'duration': 2235.90,
322 }
323 }, {
324 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
325 'info_dict': {
326 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
327 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
328 'ext': 'flv',
329 'duration': 2235.93,
330 }
331 }],
332 'info_dict': {
333 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
334 }
4d805e06
PH
335 },
336 # Flowplayer
337 {
338 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
339 'md5': '9d65602bf31c6e20014319c7d07fba27',
340 'info_dict': {
341 'id': '5123ea6d5e5a7',
342 'ext': 'mp4',
343 'age_limit': 18,
344 'uploader': 'www.handjobhub.com',
d6d9186f 345 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
4d805e06 346 }
0990305d
PH
347 },
348 # RSS feed
349 {
350 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
351 'info_dict': {
352 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
353 'title': 'Zero Punctuation',
354 'description': 're:'
355 },
356 'playlist_mincount': 11,
22a6f150
PH
357 },
358 # Multiple brightcove videos
359 # https://github.com/rg3/youtube-dl/issues/2283
360 {
361 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
362 'info_dict': {
363 'id': 'always-never',
364 'title': 'Always / Never - The New Yorker',
365 },
366 'playlist_count': 3,
367 'params': {
368 'extract_flat': False,
369 'skip_download': True,
370 }
1a94ff68
S
371 },
372 # MLB embed
373 {
374 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
375 'md5': '96f09a37e44da40dd083e12d9a683327',
376 'info_dict': {
377 'id': '33322633',
378 'ext': 'mp4',
379 'title': 'Ump changes call to ball',
380 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
381 'duration': 48,
382 'timestamp': 1401537900,
383 'upload_date': '20140531',
384 'thumbnail': 're:^https?://.*\.jpg$',
385 },
386 },
746c67d7
NJ
387 # Wistia embed
388 {
389 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
390 'md5': '8788b683c777a5cf25621eaf286d0c23',
391 'info_dict': {
392 'id': '1cfaf6b7ea',
393 'ext': 'mov',
394 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
395 'duration': 643.0,
396 'filesize': 182808282,
397 'uploader': 'education-portal.com',
398 },
399 },
52cffcb1 400 {
401 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
402 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
403 'info_dict': {
404 'id': 'uxjb0lwrcz',
405 'ext': 'mp4',
85d7b765 406 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
52cffcb1 407 'duration': 1715.0,
85d7b765 408 'uploader': 'thoughtworks.wistia.com',
70b7e3fb 409 },
52cffcb1 410 },
70b7e3fb
PH
411 # Direct download with broken HEAD
412 {
413 'url': 'http://ai-radio.org:8000/radio.opus',
414 'info_dict': {
415 'id': 'radio',
416 'ext': 'opus',
417 'title': 'radio',
418 },
419 'params': {
420 'skip_download': True, # infinite live stream
421 },
422 'expected_warnings': [
423 r'501.*Not Implemented'
424 ],
ac645ac7
PH
425 },
426 # Soundcloud embed
427 {
428 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
429 'info_dict': {
430 'id': '174391317',
431 'ext': 'mp3',
432 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
433 'uploader': 'Sophos Security',
434 'title': 'Chet Chat 171 - Oct 29, 2014',
435 'upload_date': '20141029',
436 }
af63fed7
PH
437 },
438 # Livestream embed
439 {
440 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
441 'info_dict': {
442 'id': '67864563',
443 'ext': 'flv',
444 'upload_date': '20141112',
445 'title': 'Rosetta #CometLanding webcast HL 10',
446 }
447 },
65f3a228
PH
448 # LazyYT
449 {
450 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
451 'info_dict': {
452 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
453 },
454 'playlist_mincount': 2,
455 }
cfe50f04 456 ]
9b122384 457
9b122384
PH
def report_following_redirect(self, new_url):
    """Print a '[redirect]' notice naming the URL that is followed next."""
    notice = '[redirect] Following redirect to %s' % new_url
    self._downloader.to_screen(notice)
9b122384 461
4fc946b5
PH
462 def _extract_rss(self, url, video_id, doc):
463 playlist_title = doc.find('./channel/title').text
464 playlist_desc_el = doc.find('./channel/description')
465 playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
466
467 entries = [{
468 '_type': 'url',
469 'url': e.find('link').text,
470 'title': e.find('title').text,
471 } for e in doc.findall('./channel/item')]
472
473 return {
474 '_type': 'playlist',
475 'id': url,
476 'title': playlist_title,
477 'description': playlist_desc,
478 'entries': entries,
479 }
480
c8e9a235
PH
def _extract_camtasia(self, url, video_id, webpage):
    """Extract a Camtasia Studio project as a playlist.

    Searches webpage for the fo.addVariable("csConfigFile", ...) call,
    downloads the referenced XML configuration (resolved relative to
    url) and turns every child of ./playlist/array/fileset that has a
    <uri> into one playlist entry.

    Returns None if no camtasia video can be found.
    """

    camtasia_cfg = self._search_regex(
        r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
        webpage, 'camtasia configuration file', default=None)
    if camtasia_cfg is None:
        return None

    title = self._html_search_meta('DC.title', webpage, fatal=True)

    camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
    camtasia_cfg = self._download_xml(
        camtasia_url, video_id,
        note='Downloading camtasia configuration',
        errnote='Failed to download camtasia configuration')
    fileset_node = camtasia_cfg.find('./playlist/array/fileset')
    if fileset_node is None:
        # Configuration without a fileset: there is no video to extract
        return None

    entries = []
    # Iterate the element itself; Element.getchildren() was deprecated
    # and is gone from Python 3.9 onwards.
    for n in fileset_node:
        url_n = n.find('./uri')
        if url_n is None:
            continue

        # The duration is optional metadata; do not fail when it is absent
        duration_n = n.find('./duration')
        duration_text = None if duration_n is None else duration_n.text

        entries.append({
            # File name of the uri without directory and extension
            'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
            'title': '%s - %s' % (title, n.tag),
            'url': compat_urlparse.urljoin(url, url_n.text),
            'duration': float_or_none(duration_text),
        })

    return {
        '_type': 'playlist',
        'entries': entries,
        'title': title,
    }
517
9b122384 518 def _real_extract(self, url):
ebd3c7b3
PH
519 if url.startswith('//'):
520 return {
521 '_type': 'url',
20991253 522 'url': self.http_scheme() + url,
ebd3c7b3
PH
523 }
524
a7130543
JMF
525 parsed_url = compat_urlparse.urlparse(url)
526 if not parsed_url.scheme:
04b4d394
PH
527 default_search = self._downloader.params.get('default_search')
528 if default_search is None:
1f7ccb90 529 default_search = 'fixup_error'
04b4d394 530
1f7ccb90 531 if default_search in ('auto', 'auto_warning', 'fixup_error'):
04b4d394
PH
532 if '/' in url:
533 self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
534 return self.url_result('http://' + url)
1f7ccb90 535 elif default_search != 'fixup_error':
9c1fc022 536 if default_search == 'auto_warning':
0e67ab0d
PH
537 if re.match(r'^(?:url|URL)$', url):
538 raise ExtractorError(
539 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
540 expected=True)
541 else:
542 self._downloader.report_warning(
7571c02c 543 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
04b4d394 544 return self.url_result('ytsearch:' + url)
1f7ccb90
PH
545
546 if default_search in ('error', 'fixup_error'):
7571c02c 547 raise ExtractorError(
b74e86f4
PH
548 '%r is not a valid URL. '
549 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
550 % (url, url), expected=True)
04b4d394 551 else:
f2f2c0c2
PH
552 if ':' not in default_search:
553 default_search += ':'
04b4d394 554 return self.url_result(default_search + url)
4d54ef20
PH
555
556 url, smuggled_data = unsmuggle_url(url)
557 force_videoid = None
d6e6a422 558 is_intentional = smuggled_data and smuggled_data.get('to_generic')
4d54ef20
PH
559 if smuggled_data and 'force_videoid' in smuggled_data:
560 force_videoid = smuggled_data['force_videoid']
561 video_id = force_videoid
562 else:
563 video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
a7130543 564
79649588 565 self.to_screen('%s: Requesting header' % video_id)
c1d1facd 566
ebab4520 567 head_req = HEADRequest(url)
23be51d8 568 head_response = self._request_webpage(
ebab4520
PH
569 head_req, video_id,
570 note=False, errnote='Could not send HEAD request to %s' % url,
571 fatal=False)
42393ce2 572
23be51d8 573 if head_response is not False:
42393ce2 574 # Check for redirect
23be51d8 575 new_url = head_response.geturl()
42393ce2
PH
576 if url != new_url:
577 self.report_following_redirect(new_url)
4d54ef20
PH
578 if force_videoid:
579 new_url = smuggle_url(
580 new_url, {'force_videoid': force_videoid})
cecaaf3f 581 return self.url_result(new_url)
42393ce2 582
23be51d8
PH
583 full_response = None
584 if head_response is False:
585 full_response = self._request_webpage(url, video_id)
586 head_response = full_response
587
588 # Check for direct link to a video
589 content_type = head_response.headers.get('Content-Type', '')
590 m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
591 if m:
592 upload_date = unified_strdate(
593 head_response.headers.get('Last-Modified'))
594 return {
595 'id': video_id,
596 'title': os.path.splitext(url_basename(url))[0],
ccdd0ffb 597 'direct': True,
23be51d8
PH
598 'formats': [{
599 'format_id': m.group('format_id'),
600 'url': url,
601 'vcodec': 'none' if m.group('type') == 'audio' else None
602 }],
603 'upload_date': upload_date,
604 }
42393ce2 605
d6e6a422
PH
606 if not self._downloader.params.get('test', False) and not is_intentional:
607 self._downloader.report_warning('Falling back on generic information extractor.')
608
23be51d8 609 if full_response:
37d66e7f 610 webpage = self._webpage_read_content(full_response, url, video_id)
23be51d8 611 else:
9b122384 612 webpage = self._download_webpage(url, video_id)
9b122384 613 self.report_extraction(video_id)
887c6acd 614
4fc946b5
PH
615 # Is it an RSS feed?
616 try:
bcf89ce6 617 doc = parse_xml(webpage)
4fc946b5
PH
618 if doc.tag == 'rss':
619 return self._extract_rss(url, video_id, doc)
f7300c5c 620 except compat_xml_parse_error:
4fc946b5
PH
621 pass
622
c8e9a235
PH
623 # Is it a Camtasia project?
624 camtasia_res = self._extract_camtasia(url, video_id, webpage)
625 if camtasia_res is not None:
626 return camtasia_res
627
14390730
S
628 # Sometimes embedded video player is hidden behind percent encoding
629 # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
630 # Unescaping the whole page allows to handle those cases in a generic way
1f7659db
S
631 webpage = compat_urllib_parse.unquote(webpage)
632
887c6acd
PH
633 # it's tempting to parse this further, but you would
634 # have to take into account all the variations like
635 # Video Title - Site Name
636 # Site Name | Video Title
637 # Video Title - Tagline | Site Name
638 # and so on and so forth; it's just not practical
ef4fd848 639 video_title = self._html_search_regex(
79649588
PH
640 r'(?s)<title>(.*?)</title>', webpage, 'video title',
641 default='video')
ef4fd848 642
4d805e06
PH
643 # Try to detect age limit automatically
644 age_limit = self._rta_search(webpage)
645 # And then there are the jokers who advertise that they use RTA,
646 # but actually don't.
647 AGE_LIMIT_MARKERS = [
648 r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
649 ]
650 if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
651 age_limit = 18
652
ef4fd848
PH
653 # video uploader is domain name
654 video_uploader = self._search_regex(
79649588 655 r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
887c6acd 656
ed2d6a19
PH
657 # Helper method
658 def _playlist_from_matches(matches, getter, ie=None):
3b2f933b
PH
659 urlrs = orderedSet(
660 self.url_result(self._proto_relative_url(getter(m)), ie)
661 for m in matches)
ed2d6a19
PH
662 return self.playlist_result(
663 urlrs, playlist_id=video_id, playlist_title=video_title)
664
627a91a9 665 # Look for BrightCove:
99877772
PH
666 bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
667 if bc_urls:
79649588 668 self.to_screen('Brightcove video detected.')
99877772
PH
669 entries = [{
670 '_type': 'url',
671 'url': smuggle_url(bc_url, {'Referer': url}),
672 'ie_key': 'Brightcove'
673 } for bc_url in bc_urls]
674
675 return {
676 '_type': 'playlist',
677 'title': video_title,
678 'id': video_id,
679 'entries': entries,
680 }
cfe50f04 681
7115ca84 682 # Look for embedded (iframe) Vimeo player
9d4660ca 683 mobj = re.search(
15fd51b3 684 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
9d4660ca 685 if mobj:
15fd51b3 686 player_url = unescapeHTML(mobj.group('url'))
9d4660ca 687 surl = smuggle_url(player_url, {'Referer': url})
09a42738 688 return self.url_result(surl)
9d4660ca 689
7115ca84
PH
690 # Look for embedded (swf embed) Vimeo player
691 mobj = re.search(
09a42738 692 r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
7115ca84 693 if mobj:
09a42738 694 return self.url_result(mobj.group(1))
7115ca84 695
53c1d3ef 696 # Look for embedded YouTube player
1f9da904 697 matches = re.findall(r'''(?x)
2b88feed
PH
698 (?:
699 <iframe[^>]+?src=|
c71dfccc 700 data-video-url=|
2b88feed 701 <embed[^>]+?src=|
a7e97f6d
PH
702 embedSWF\(?:\s*|
703 new\s+SWFObject\(
2b88feed
PH
704 )
705 (["\'])
1bf5423e 706 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
6b08cdf6 707 (?:embed|v|p)/.+?)
1f9da904 708 \1''', webpage)
887c6acd 709 if matches:
ed2d6a19 710 return _playlist_from_matches(
3b2f933b 711 matches, lambda m: unescapeHTML(m[1]))
53c1d3ef 712
65f3a228
PH
713 # Look for lazyYT YouTube embed
714 matches = re.findall(
715 r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
716 if matches:
717 return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
718
355e4fd0
PH
719 # Look for embedded Dailymotion player
720 matches = re.findall(
ef4fd848 721 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
355e4fd0 722 if matches:
ed2d6a19
PH
723 return _playlist_from_matches(
724 matches, lambda m: unescapeHTML(m[1]))
355e4fd0 725
8489578d
NJ
726 # Look for embedded Dailymotion playlist player (#3822)
727 m = re.search(
728 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
729 if m:
730 playlists = re.findall(
731 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
732 if playlists:
733 return _playlist_from_matches(
734 playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
735
ef4fd848
PH
736 # Look for embedded Wistia player
737 match = re.search(
281d3f1d 738 r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
ef4fd848 739 if match:
9471c444
NJ
740 embed_url = self._proto_relative_url(
741 unescapeHTML(match.group('url')))
ef4fd848
PH
742 return {
743 '_type': 'url_transparent',
9471c444 744 'url': embed_url,
ef4fd848
PH
745 'ie_key': 'Wistia',
746 'uploader': video_uploader,
747 'title': video_title,
748 'id': video_id,
749 }
5f6a1245 750
9471c444 751 match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
746c67d7
NJ
752 if match:
753 return {
754 '_type': 'url_transparent',
755 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
756 'ie_key': 'Wistia',
757 'uploader': video_uploader,
758 'title': video_title,
759 'id': match.group('id')
760 }
ef4fd848 761
ee3e63e4 762 # Look for embedded blip.tv player
19dab5e6 763 mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
ee3e63e4 764 if mobj:
2514d263 765 return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
1f8b6af7 766 mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
ee3e63e4 767 if mobj:
19dab5e6 768 return self.url_result(mobj.group(1), 'BlipTV')
ee3e63e4 769
fa35cdad
PH
770 # Look for embedded condenast player
771 matches = re.findall(
772 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
773 webpage)
774 if matches:
775 return {
776 '_type': 'playlist',
777 'entries': [{
778 '_type': 'url',
779 'ie_key': 'CondeNast',
780 'url': ma,
781 } for ma in matches],
782 'title': video_title,
783 'id': video_id,
784 }
785
c19f7764
JMF
786 # Look for Bandcamp pages with custom domain
787 mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
788 if mobj is not None:
789 burl = unescapeHTML(mobj.group(1))
09804265
JMF
790 # Don't set the extractor because it can be a track url or an album
791 return self.url_result(burl)
c19f7764 792
f25571ff
PH
793 # Look for embedded Vevo player
794 mobj = re.search(
795 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
796 if mobj is not None:
797 return self.url_result(mobj.group('url'))
798
c0d0b01f 799 # Look for Ooyala videos
750f9020 800 mobj = (re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
9e1a5b84 801 re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage))
c0d0b01f 802 if mobj is not None:
750f9020 803 return OoyalaIE._build_url_result(mobj.group('ec'))
c0d0b01f 804
aa94a6d3 805 # Look for Aparat videos
48099643 806 mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
aa94a6d3
PH
807 if mobj is not None:
808 return self.url_result(mobj.group(1), 'Aparat')
809
c93c2ab1 810 # Look for MPORA videos
c3f51436 811 mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
c93c2ab1
PH
812 if mobj is not None:
813 return self.url_result(mobj.group(1), 'Mpora')
5f59ee79 814
15c0e8e7 815 # Look for embedded NovaMov-based player
8f89e687 816 mobj = re.search(
8dfa187b 817 r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
15c0e8e7
S
818 (?P<url>http://(?:(?:embed|www)\.)?
819 (?:novamov\.com|
820 nowvideo\.(?:ch|sx|eu|at|ag|co)|
821 videoweed\.(?:es|com)|
822 movshare\.(?:net|sx|ag)|
823 divxstage\.(?:eu|net|ch|co|at|ag))
824 /embed\.php.+?)\1''', webpage)
8f89e687 825 if mobj is not None:
15c0e8e7 826 return self.url_result(mobj.group('url'))
50f56607 827
9834872b
PH
828 # Look for embedded Facebook player
829 mobj = re.search(
db1f3888 830 r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
9834872b
PH
831 if mobj is not None:
832 return self.url_result(mobj.group('url'), 'Facebook')
833
ca97a56e
S
834 # Look for embedded VK player
835 mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
836 if mobj is not None:
837 return self.url_result(mobj.group('url'), 'VK')
838
0364fa8b
S
839 # Look for embedded ivi player
840 mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
841 if mobj is not None:
842 return self.url_result(mobj.group('url'), 'Ivi')
843
db1f3888
PH
844 # Look for embedded Huffington Post player
845 mobj = re.search(
c3f51436 846 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
db1f3888
PH
847 if mobj is not None:
848 return self.url_result(mobj.group('url'), 'HuffPost')
849
1b86cc41 850 # Look for embed.ly
851 mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
852 if mobj is not None:
853 return self.url_result(mobj.group('url'))
854 mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
855 if mobj is not None:
856 return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
857
60cc4dc4
PH
858 # Look for funnyordie embed
859 matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
860 if matches:
ed2d6a19
PH
861 return _playlist_from_matches(
862 matches, getter=unescapeHTML, ie='FunnyOrDie')
60cc4dc4 863
93d020dd
S
864 # Look for embedded RUTV player
865 rutv_url = RUTVIE._extract_url(webpage)
866 if rutv_url:
867 return self.url_result(rutv_url, 'RUTV')
868
7e2ede98
JMF
869 # Look for embedded TED player
870 mobj = re.search(
871 r'<iframe[^>]+?src=(["\'])(?P<url>http://embed\.ted\.com/.+?)\1', webpage)
872 if mobj is not None:
873 return self.url_result(mobj.group('url'), 'TED')
874
5c386252 875 # Look for embedded Ustream videos
876 mobj = re.search(
877 r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
878 if mobj is not None:
879 return self.url_result(mobj.group('url'), 'Ustream')
880
893f8832
PH
881 # Look for embedded arte.tv player
882 mobj = re.search(
883 r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
884 webpage)
885 if mobj is not None:
886 return self.url_result(mobj.group('url'), 'ArteTVEmbed')
887
cb3ac1c6
S
888 # Look for embedded smotri.com player
889 smotri_url = SmotriIE._extract_url(webpage)
890 if smotri_url:
891 return self.url_result(smotri_url, 'Smotri')
892
20991253
PH
893 # Look for embedded soundcloud player
894 mobj = re.search(
ac645ac7 895 r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
20991253
PH
896 webpage)
897 if mobj is not None:
898 url = unescapeHTML(mobj.group('url'))
899 return self.url_result(url)
900
826ec77f
PH
901 # Look for embedded vulture.com player
902 mobj = re.search(
903 r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
904 webpage)
905 if mobj is not None:
906 url = unescapeHTML(mobj.group('url'))
907 return self.url_result(url, ie='Vulture')
908
c5cd249e
JMF
909 # Look for embedded mtvservices player
910 mobj = re.search(
911 r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
912 webpage)
913 if mobj is not None:
914 url = unescapeHTML(mobj.group('url'))
915 return self.url_result(url, ie='MTVServicesEmbedded')
916
49807b4a
S
917 # Look for embedded yahoo player
918 mobj = re.search(
919 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
920 webpage)
921 if mobj is not None:
922 return self.url_result(mobj.group('url'), 'Yahoo')
923
2ef6fcb5
PH
924 # Look for embedded sbs.com.au player
925 mobj = re.search(
926 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)sbs\.com\.au/ondemand/video/single/.+?)\1',
927 webpage)
928 if mobj is not None:
929 return self.url_result(mobj.group('url'), 'SBS')
930
1a94ff68 931 mobj = re.search(
5263cdfc 932 r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1a94ff68
S
933 webpage)
934 if mobj is not None:
935 return self.url_result(mobj.group('url'), 'MLB')
936
1419fafd
S
937 mobj = re.search(
938 r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
939 webpage)
940 if mobj is not None:
941 return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
942
af63fed7
PH
943 mobj = re.search(
944 r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
945 webpage)
946 if mobj is not None:
947 return self.url_result(mobj.group('url'), 'Livestream')
948
ced659bb
S
def check_video(vurl):
    """Tell whether vurl should be kept as a candidate video URL.

    Only the path component of the URL is examined. The URL is kept when
    that path contains a dot and its extension, as reported by
    determine_ext, is not one of the rejected extensions below.
    """
    # Extensions rejected by this check.
    rejected_exts = ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
    path = compat_urlparse.urlparse(vurl).path
    ext = determine_ext(path)
    if '.' not in path:
        return False
    return ext not in rejected_exts
953
def filter_video(urls):
    """Return a list of the entries of urls accepted by check_video, in order."""
    return [candidate for candidate in urls if check_video(candidate)]
956
9b122384 957 # Start with something easy: JW Player in SWFObject
ced659bb 958 found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
b30b8698 959 if not found:
d981cef6 960 # Look for gorilla-vid style embedding
ced659bb 961 found = filter_video(re.findall(r'''(?sx)
c0292e8a
PH
962 (?:
963 jw_plugins|
964 JWPlayerOptions|
965 jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
966 )
ced659bb 967 .*?file\s*:\s*["\'](.*?)["\']''', webpage))
b30b8698 968 if not found:
9b122384 969 # Broaden the search a little bit
ced659bb 970 found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
b30b8698
PH
971 if not found:
972 # Broaden the findall a little bit: JWPlayer JS loader
ced659bb
S
973 found = filter_video(re.findall(
974 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
4d805e06
PH
975 if not found:
976 # Flow player
ced659bb 977 found = filter_video(re.findall(r'''(?xs)
4d805e06
PH
978 flowplayer\("[^"]+",\s*
979 \{[^}]+?\}\s*,
980 \s*{[^}]+? ["']?clip["']?\s*:\s*\{\s*
981 ["']?url["']?\s*:\s*["']([^"']+)["']
ced659bb 982 ''', webpage))
b30b8698 983 if not found:
9b122384 984 # Try to find twitter cards info
ced659bb
S
985 found = filter_video(re.findall(
986 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
b30b8698 987 if not found:
9b122384
PH
988 # We look for Open Graph info:
989 # We have to match any number of spaces between elements; some sites try to align them (e.g. statigr.am)
b30b8698 990 m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
9b122384
PH
991 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
992 if m_video_type is not None:
ced659bb 993 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
b30b8698 994 if not found:
7fea7156 995 # HTML5 video
9b32eca3 996 found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
b30b8698 997 if not found:
a5a45015 998 found = re.search(
89ef304b 999 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
a04aa7a9 1000 r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'?([^\'"]+)',
89ef304b 1001 webpage)
b30b8698
PH
1002 if found:
1003 new_url = found.group(1)
89ef304b
PH
1004 self.report_following_redirect(new_url)
1005 return {
1006 '_type': 'url',
1007 'url': new_url,
1008 }
b30b8698 1009 if not found:
79649588 1010 raise ExtractorError('Unsupported URL: %s' % url)
9b122384 1011
b30b8698
PH
1012 entries = []
1013 for video_url in found:
1014 video_url = compat_urlparse.urljoin(url, video_url)
1015 video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
9b122384 1016
b30b8698
PH
1017 # Sometimes, jwplayer extraction will result in a YouTube URL
1018 if YoutubeIE.suitable(video_url):
1019 entries.append(self.url_result(video_url, 'Youtube'))
1020 continue
9b122384 1021
b30b8698
PH
1022 # here's a fun little line of code for you:
1023 video_id = os.path.splitext(video_id)[0]
fc9713a1 1024
b30b8698
PH
1025 entries.append({
1026 'id': video_id,
1027 'url': video_url,
1028 'uploader': video_uploader,
1029 'title': video_title,
4d805e06 1030 'age_limit': age_limit,
b30b8698
PH
1031 })
1032
1033 if len(entries) == 1:
669f0e7c 1034 return entries[0]
b30b8698
PH
1035 else:
1036 for num, e in enumerate(entries, start=1):
1037 e['title'] = '%s (%d)' % (e['title'], num)
1038 return {
1039 '_type': 'playlist',
1040 'entries': entries,
1041 }