]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/generic.py
Credit @yaccz for freevideo (#4131)
[yt-dlp.git] / youtube_dl / extractor / generic.py
CommitLineData
cfe50f04
JMF
1# encoding: utf-8
2
79649588
PH
3from __future__ import unicode_literals
4
9b122384
PH
5import os
6import re
7
8from .common import InfoExtractor
fc9713a1 9from .youtube import YoutubeIE
8c25f81b 10from ..compat import (
9b122384 11 compat_urllib_parse,
a5caba1e 12 compat_urlparse,
f7300c5c 13 compat_xml_parse_error,
8c25f81b
PH
14)
15from ..utils import (
b759a0d4 16 determine_ext,
9b122384 17 ExtractorError,
c8e9a235 18 float_or_none,
aa94a6d3 19 HEADRequest,
ed2d6a19 20 orderedSet,
bcf89ce6 21 parse_xml,
9d4660ca
PH
22 smuggle_url,
23 unescapeHTML,
42393ce2 24 unified_strdate,
4d54ef20 25 unsmuggle_url,
42393ce2 26 url_basename,
9b122384 27)
cfe50f04 28from .brightcove import BrightcoveIE
c0d0b01f 29from .ooyala import OoyalaIE
93d020dd 30from .rutv import RUTVIE
cb3ac1c6 31from .smotri import SmotriIE
1419fafd 32from .condenast import CondeNastIE
9b122384 33
0838239e 34
9b122384 35class GenericIE(InfoExtractor):
79649588 36 IE_DESC = 'Generic downloader that works on some sites'
9b122384 37 _VALID_URL = r'.*'
79649588 38 IE_NAME = 'generic'
cfe50f04
JMF
39 _TESTS = [
40 {
79649588 41 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
d360a146 42 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
79649588 43 'info_dict': {
d360a146
S
44 'id': '13601338388002',
45 'ext': 'mp4',
79649588
PH
46 'uploader': 'www.hodiho.fr',
47 'title': 'R\u00e9gis plante sa Jeep',
cfe50f04
JMF
48 }
49 },
c19f7764
JMF
50 # bandcamp page with custom domain
51 {
79649588
PH
52 'add_ie': ['Bandcamp'],
53 'url': 'http://bronyrock.com/track/the-pony-mash',
79649588 54 'info_dict': {
fd50bf62
S
55 'id': '3235767654',
56 'ext': 'mp3',
79649588
PH
57 'title': 'The Pony Mash',
58 'uploader': 'M_Pallante',
c19f7764 59 },
79649588 60 'skip': 'There is a limit of 200 free downloads / month for the test song',
c19f7764 61 },
eeb165e6 62 # embedded brightcove video
dd5bcdc4
JMF
63 # it also tests brightcove videos that need to set the 'Referer' in the
64 # http requests
eeb165e6 65 {
79649588
PH
66 'add_ie': ['Brightcove'],
67 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
68 'info_dict': {
69 'id': '2765128793001',
70 'ext': 'mp4',
71 'title': 'Le cours de bourse : l’analyse technique',
72 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
73 'uploader': 'BFM BUSINESS',
eeb165e6 74 },
79649588
PH
75 'params': {
76 'skip_download': True,
eeb165e6
JMF
77 },
78 },
17ab4d3b
PH
79 {
80 # https://github.com/rg3/youtube-dl/issues/2253
81 'url': 'http://bcove.me/i6nfkrc3',
17ab4d3b
PH
82 'md5': '0ba9446db037002366bab3b3eb30c88c',
83 'info_dict': {
fd50bf62
S
84 'id': '3101154703001',
85 'ext': 'mp4',
17ab4d3b
PH
86 'title': 'Still no power',
87 'uploader': 'thestar.com',
88 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
89 },
90 'add_ie': ['Brightcove'],
91 },
0479c625
S
92 {
93 'url': 'http://www.championat.com/video/football/v/87/87499.html',
94 'md5': 'fb973ecf6e4a78a67453647444222983',
95 'info_dict': {
96 'id': '3414141473001',
97 'ext': 'mp4',
98 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
99 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
100 'uploader': 'Championat',
101 },
102 },
bdf97017 103 {
37aab278 104 # https://github.com/rg3/youtube-dl/issues/3541
bdf97017
NJ
105 'add_ie': ['Brightcove'],
106 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
107 'info_dict': {
108 'id': '3866516442001',
37aab278 109 'ext': 'mp4',
bdf97017
NJ
110 'title': 'Leer mij vrouwen kennen: Aflevering 1',
111 'description': 'Leer mij vrouwen kennen: Aflevering 1',
112 'uploader': 'SBS Broadcasting',
113 },
37aab278 114 'skip': 'Restricted to Netherlands',
bdf97017 115 'params': {
37aab278 116 'skip_download': True, # m3u8 download
bdf97017
NJ
117 },
118 },
42393ce2
PH
119 # Direct link to a video
120 {
79649588 121 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
79649588
PH
122 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
123 'info_dict': {
124 'id': 'trailer',
89ef304b 125 'ext': 'mp4',
79649588
PH
126 'title': 'trailer',
127 'upload_date': '20100513',
42393ce2 128 }
c0d0b01f
JMF
129 },
130 # ooyala video
131 {
79649588
PH
132 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
133 'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
134 'info_dict': {
135 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
136 'ext': 'mp4',
3486df38 137 'title': '2cc213299525360.mov', # that's what we get
c0d0b01f
JMF
138 },
139 },
89ef304b
PH
140 # google redirect
141 {
142 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
143 'info_dict': {
144 'id': 'cmQHVoWB5FY',
145 'ext': 'mp4',
146 'upload_date': '20130224',
147 'uploader_id': 'TheVerge',
148 'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.',
149 'uploader': 'The Verge',
150 'title': 'First Firefox OS phones side-by-side',
151 },
152 'params': {
153 'skip_download': False,
154 }
f55a1f0a 155 },
1b86cc41 156 # embed.ly video
157 {
158 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
159 'info_dict': {
160 'id': '9ODmcdjQcHQ',
161 'ext': 'mp4',
0a5bce56
PH
162 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
163 'upload_date': '20140225',
164 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
165 'uploader': 'Tested',
166 'uploader_id': 'testedcom',
1b86cc41 167 },
168 # No need to test YoutubeIE here
169 'params': {
170 'skip_download': True,
171 },
172 },
60cc4dc4
PH
173 # funnyordie embed
174 {
175 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
60cc4dc4
PH
176 'info_dict': {
177 'id': '18e820ec3f',
178 'ext': 'mp4',
179 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
180 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
93d020dd 181 },
60cc4dc4 182 },
93d020dd
S
183 # RUTV embed
184 {
185 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
186 'info_dict': {
187 'id': '776940',
188 'ext': 'mp4',
189 'title': 'Охотское море стало целиком российским',
190 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
191 },
192 'params': {
193 # m3u8 download
194 'skip_download': True,
195 },
aab74fa1
PH
196 },
197 # Embedded TED video
198 {
199 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
a8eb5a8e 200 'md5': '65fdff94098e4a607385a60c5177c638',
aab74fa1 201 'info_dict': {
a8eb5a8e 202 'id': '1969',
aab74fa1 203 'ext': 'mp4',
a8eb5a8e
PH
204 'title': 'Hidden miracles of the natural world',
205 'uploader': 'Louie Schwartzberg',
206 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
aab74fa1 207 }
60cc4dc4 208 },
5c386252 209 # Embedded Ustream video
210 {
211 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
212 'md5': '27b99cdb639c9b12a79bca876a073417',
213 'info_dict': {
ca6aada4 214 'id': '45734260',
215 'ext': 'flv',
216 'uploader': 'AU SPA: The NSA and Privacy',
5c386252 217 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
218 }
219 },
d95e35d6
S
220 # nowvideo embed hidden behind percent encoding
221 {
222 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
223 'md5': '2baf4ddd70f697d94b1c18cf796d5107',
224 'info_dict': {
225 'id': '06e53103ca9aa',
226 'ext': 'flv',
227 'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
228 'description': 'No description',
229 },
0f2a2ba1 230 },
893f8832
PH
231 # arte embed
232 {
233 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
234 'md5': '7653032cbb25bf6c80d80f217055fa43',
235 'info_dict': {
236 'id': '048195-004_PLUS7-F',
237 'ext': 'flv',
238 'title': 'X:enius',
239 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
240 'upload_date': '20140320',
241 },
242 'params': {
243 'skip_download': 'Requires rtmpdump'
244 }
245 },
fa35cdad
PH
246 # Condé Nast embed
247 {
248 'url': 'http://www.wired.com/2014/04/honda-asimo/',
249 'md5': 'ba0dfe966fa007657bd1443ee672db0f',
250 'info_dict': {
251 'id': '53501be369702d3275860000',
252 'ext': 'mp4',
253 'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
254 }
ebd3c7b3
PH
255 },
256 # Dailymotion embed
257 {
258 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
259 'md5': '441aeeb82eb72c422c7f14ec533999cd',
260 'info_dict': {
261 'id': 'k2mm4bCdJ6CQ2i7c8o2',
262 'ext': 'mp4',
263 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
264 'uploader': 'Spi0n',
265 },
266 'add_ie': ['Dailymotion'],
2b88feed
PH
267 },
268 # YouTube embed
269 {
270 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
271 'info_dict': {
272 'id': 'FXRb4ykk4S0',
273 'ext': 'mp4',
274 'title': 'The NBL Auction 2014',
275 'uploader': 'BADMINTON England',
276 'uploader_id': 'BADMINTONEvents',
277 'upload_date': '20140603',
278 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
279 },
280 'add_ie': ['Youtube'],
281 'params': {
282 'skip_download': True,
283 }
284 },
c5cd249e
JMF
285 # MTVServices embed
286 {
287 'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
288 'md5': '35727f82f58c76d996fc188f9755b0d5',
289 'info_dict': {
290 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
291 'ext': 'mp4',
292 'title': 'Review',
293 'description': 'Mario\'s life in the fast lane has never looked so good.',
294 },
295 },
61013473 296 # YouTube embed via <data-embed-url="">
297 {
298 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
61013473 299 'info_dict': {
a8eb5a8e 300 'id': '4vAffPZIT44',
61013473 301 'ext': 'mp4',
a8eb5a8e 302 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
ed2d6a19
PH
303 'uploader': 'Gameloft',
304 'uploader_id': 'gameloft',
a8eb5a8e
PH
305 'upload_date': '20140828',
306 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
ed2d6a19
PH
307 },
308 'params': {
309 'skip_download': True,
61013473 310 }
c8e9a235
PH
311 },
312 # Camtasia studio
313 {
314 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
315 'playlist': [{
316 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
317 'info_dict': {
318 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
319 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
320 'ext': 'flv',
321 'duration': 2235.90,
322 }
323 }, {
324 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
325 'info_dict': {
326 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
327 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
328 'ext': 'flv',
329 'duration': 2235.93,
330 }
331 }],
332 'info_dict': {
333 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
334 }
4d805e06
PH
335 },
336 # Flowplayer
337 {
338 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
339 'md5': '9d65602bf31c6e20014319c7d07fba27',
340 'info_dict': {
341 'id': '5123ea6d5e5a7',
342 'ext': 'mp4',
343 'age_limit': 18,
344 'uploader': 'www.handjobhub.com',
d6d9186f 345 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
4d805e06 346 }
0990305d
PH
347 },
348 # RSS feed
349 {
350 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
351 'info_dict': {
352 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
353 'title': 'Zero Punctuation',
354 'description': 're:'
355 },
356 'playlist_mincount': 11,
22a6f150
PH
357 },
358 # Multiple brightcove videos
359 # https://github.com/rg3/youtube-dl/issues/2283
360 {
361 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
362 'info_dict': {
363 'id': 'always-never',
364 'title': 'Always / Never - The New Yorker',
365 },
366 'playlist_count': 3,
367 'params': {
368 'extract_flat': False,
369 'skip_download': True,
370 }
1a94ff68
S
371 },
372 # MLB embed
373 {
374 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
375 'md5': '96f09a37e44da40dd083e12d9a683327',
376 'info_dict': {
377 'id': '33322633',
378 'ext': 'mp4',
379 'title': 'Ump changes call to ball',
380 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
381 'duration': 48,
382 'timestamp': 1401537900,
383 'upload_date': '20140531',
384 'thumbnail': 're:^https?://.*\.jpg$',
385 },
386 },
746c67d7
NJ
387 # Wistia embed
388 {
389 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
390 'md5': '8788b683c777a5cf25621eaf286d0c23',
391 'info_dict': {
392 'id': '1cfaf6b7ea',
393 'ext': 'mov',
394 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
395 'duration': 643.0,
396 'filesize': 182808282,
397 'uploader': 'education-portal.com',
398 },
399 },
52cffcb1 400 {
401 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
402 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
403 'info_dict': {
404 'id': 'uxjb0lwrcz',
405 'ext': 'mp4',
85d7b765 406 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
52cffcb1 407 'duration': 1715.0,
85d7b765 408 'uploader': 'thoughtworks.wistia.com',
70b7e3fb 409 },
52cffcb1 410 },
70b7e3fb
PH
411 # Direct download with broken HEAD
412 {
413 'url': 'http://ai-radio.org:8000/radio.opus',
414 'info_dict': {
415 'id': 'radio',
416 'ext': 'opus',
417 'title': 'radio',
418 },
419 'params': {
420 'skip_download': True, # infinite live stream
421 },
422 'expected_warnings': [
423 r'501.*Not Implemented'
424 ],
ac645ac7
PH
425 },
426 # Soundcloud embed
427 {
428 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
429 'info_dict': {
430 'id': '174391317',
431 'ext': 'mp3',
432 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
433 'uploader': 'Sophos Security',
434 'title': 'Chet Chat 171 - Oct 29, 2014',
435 'upload_date': '20141029',
436 }
70b7e3fb 437 }
cfe50f04 438 ]
9b122384 439
9b122384
PH
def report_following_redirect(self, new_url):
    """Print a notice that the request is being redirected to new_url."""
    message = '[redirect] Following redirect to %s' % new_url
    self._downloader.to_screen(message)
9b122384 443
4fc946b5
PH
444 def _extract_rss(self, url, video_id, doc):
445 playlist_title = doc.find('./channel/title').text
446 playlist_desc_el = doc.find('./channel/description')
447 playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
448
449 entries = [{
450 '_type': 'url',
451 'url': e.find('link').text,
452 'title': e.find('title').text,
453 } for e in doc.findall('./channel/item')]
454
455 return {
456 '_type': 'playlist',
457 'id': url,
458 'title': playlist_title,
459 'description': playlist_desc,
460 'entries': entries,
461 }
462
c8e9a235
PH
463 def _extract_camtasia(self, url, video_id, webpage):
464 """ Returns None if no camtasia video can be found. """
465
466 camtasia_cfg = self._search_regex(
467 r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
468 webpage, 'camtasia configuration file', default=None)
469 if camtasia_cfg is None:
470 return None
471
472 title = self._html_search_meta('DC.title', webpage, fatal=True)
473
474 camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
475 camtasia_cfg = self._download_xml(
476 camtasia_url, video_id,
477 note='Downloading camtasia configuration',
478 errnote='Failed to download camtasia configuration')
479 fileset_node = camtasia_cfg.find('./playlist/array/fileset')
480
481 entries = []
482 for n in fileset_node.getchildren():
483 url_n = n.find('./uri')
484 if url_n is None:
485 continue
486
487 entries.append({
488 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
489 'title': '%s - %s' % (title, n.tag),
490 'url': compat_urlparse.urljoin(url, url_n.text),
491 'duration': float_or_none(n.find('./duration').text),
492 })
493
494 return {
495 '_type': 'playlist',
496 'entries': entries,
497 'title': title,
498 }
499
9b122384 500 def _real_extract(self, url):
ebd3c7b3
PH
501 if url.startswith('//'):
502 return {
503 '_type': 'url',
20991253 504 'url': self.http_scheme() + url,
ebd3c7b3
PH
505 }
506
a7130543
JMF
507 parsed_url = compat_urlparse.urlparse(url)
508 if not parsed_url.scheme:
04b4d394
PH
509 default_search = self._downloader.params.get('default_search')
510 if default_search is None:
1f7ccb90 511 default_search = 'fixup_error'
04b4d394 512
1f7ccb90 513 if default_search in ('auto', 'auto_warning', 'fixup_error'):
04b4d394
PH
514 if '/' in url:
515 self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
516 return self.url_result('http://' + url)
1f7ccb90 517 elif default_search != 'fixup_error':
9c1fc022 518 if default_search == 'auto_warning':
0e67ab0d
PH
519 if re.match(r'^(?:url|URL)$', url):
520 raise ExtractorError(
521 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
522 expected=True)
523 else:
524 self._downloader.report_warning(
7571c02c 525 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
04b4d394 526 return self.url_result('ytsearch:' + url)
1f7ccb90
PH
527
528 if default_search in ('error', 'fixup_error'):
7571c02c
PH
529 raise ExtractorError(
530 ('%r is not a valid URL. '
eef4a7a3 531 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
7571c02c 532 ) % (url, url), expected=True)
04b4d394 533 else:
f2f2c0c2
PH
534 if ':' not in default_search:
535 default_search += ':'
04b4d394 536 return self.url_result(default_search + url)
4d54ef20
PH
537
538 url, smuggled_data = unsmuggle_url(url)
539 force_videoid = None
d6e6a422 540 is_intentional = smuggled_data and smuggled_data.get('to_generic')
4d54ef20
PH
541 if smuggled_data and 'force_videoid' in smuggled_data:
542 force_videoid = smuggled_data['force_videoid']
543 video_id = force_videoid
544 else:
545 video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
a7130543 546
79649588 547 self.to_screen('%s: Requesting header' % video_id)
c1d1facd 548
ebab4520 549 head_req = HEADRequest(url)
23be51d8 550 head_response = self._request_webpage(
ebab4520
PH
551 head_req, video_id,
552 note=False, errnote='Could not send HEAD request to %s' % url,
553 fatal=False)
42393ce2 554
23be51d8 555 if head_response is not False:
42393ce2 556 # Check for redirect
23be51d8 557 new_url = head_response.geturl()
42393ce2
PH
558 if url != new_url:
559 self.report_following_redirect(new_url)
4d54ef20
PH
560 if force_videoid:
561 new_url = smuggle_url(
562 new_url, {'force_videoid': force_videoid})
cecaaf3f 563 return self.url_result(new_url)
42393ce2 564
23be51d8
PH
565 full_response = None
566 if head_response is False:
567 full_response = self._request_webpage(url, video_id)
568 head_response = full_response
569
570 # Check for direct link to a video
571 content_type = head_response.headers.get('Content-Type', '')
572 m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
573 if m:
574 upload_date = unified_strdate(
575 head_response.headers.get('Last-Modified'))
576 return {
577 'id': video_id,
578 'title': os.path.splitext(url_basename(url))[0],
ccdd0ffb 579 'direct': True,
23be51d8
PH
580 'formats': [{
581 'format_id': m.group('format_id'),
582 'url': url,
583 'vcodec': 'none' if m.group('type') == 'audio' else None
584 }],
585 'upload_date': upload_date,
586 }
42393ce2 587
d6e6a422
PH
588 if not self._downloader.params.get('test', False) and not is_intentional:
589 self._downloader.report_warning('Falling back on generic information extractor.')
590
23be51d8 591 if full_response:
37d66e7f 592 webpage = self._webpage_read_content(full_response, url, video_id)
23be51d8 593 else:
9b122384 594 webpage = self._download_webpage(url, video_id)
9b122384 595 self.report_extraction(video_id)
887c6acd 596
4fc946b5
PH
597 # Is it an RSS feed?
598 try:
bcf89ce6 599 doc = parse_xml(webpage)
4fc946b5
PH
600 if doc.tag == 'rss':
601 return self._extract_rss(url, video_id, doc)
f7300c5c 602 except compat_xml_parse_error:
4fc946b5
PH
603 pass
604
c8e9a235
PH
605 # Is it a Camtasia project?
606 camtasia_res = self._extract_camtasia(url, video_id, webpage)
607 if camtasia_res is not None:
608 return camtasia_res
609
14390730
S
610 # Sometimes embedded video player is hidden behind percent encoding
611 # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
612 # Unescaping the whole page allows to handle those cases in a generic way
1f7659db
S
613 webpage = compat_urllib_parse.unquote(webpage)
614
887c6acd
PH
615 # it's tempting to parse this further, but you would
616 # have to take into account all the variations like
617 # Video Title - Site Name
618 # Site Name | Video Title
619 # Video Title - Tagline | Site Name
620 # and so on and so forth; it's just not practical
ef4fd848 621 video_title = self._html_search_regex(
79649588
PH
622 r'(?s)<title>(.*?)</title>', webpage, 'video title',
623 default='video')
ef4fd848 624
4d805e06
PH
625 # Try to detect age limit automatically
626 age_limit = self._rta_search(webpage)
627 # And then there are the jokers who advertise that they use RTA,
628 # but actually don't.
629 AGE_LIMIT_MARKERS = [
630 r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
631 ]
632 if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
633 age_limit = 18
634
ef4fd848
PH
635 # video uploader is domain name
636 video_uploader = self._search_regex(
79649588 637 r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
887c6acd 638
ed2d6a19
PH
639 # Helper method
640 def _playlist_from_matches(matches, getter, ie=None):
3b2f933b
PH
641 urlrs = orderedSet(
642 self.url_result(self._proto_relative_url(getter(m)), ie)
643 for m in matches)
ed2d6a19
PH
644 return self.playlist_result(
645 urlrs, playlist_id=video_id, playlist_title=video_title)
646
627a91a9 647 # Look for BrightCove:
99877772
PH
648 bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
649 if bc_urls:
79649588 650 self.to_screen('Brightcove video detected.')
99877772
PH
651 entries = [{
652 '_type': 'url',
653 'url': smuggle_url(bc_url, {'Referer': url}),
654 'ie_key': 'Brightcove'
655 } for bc_url in bc_urls]
656
657 return {
658 '_type': 'playlist',
659 'title': video_title,
660 'id': video_id,
661 'entries': entries,
662 }
cfe50f04 663
7115ca84 664 # Look for embedded (iframe) Vimeo player
9d4660ca 665 mobj = re.search(
15fd51b3 666 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
9d4660ca 667 if mobj:
15fd51b3 668 player_url = unescapeHTML(mobj.group('url'))
9d4660ca 669 surl = smuggle_url(player_url, {'Referer': url})
09a42738 670 return self.url_result(surl)
9d4660ca 671
7115ca84
PH
672 # Look for embedded (swf embed) Vimeo player
673 mobj = re.search(
09a42738 674 r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
7115ca84 675 if mobj:
09a42738 676 return self.url_result(mobj.group(1))
7115ca84 677
53c1d3ef 678 # Look for embedded YouTube player
1f9da904 679 matches = re.findall(r'''(?x)
2b88feed
PH
680 (?:
681 <iframe[^>]+?src=|
c71dfccc 682 data-video-url=|
2b88feed 683 <embed[^>]+?src=|
a7e97f6d
PH
684 embedSWF\(?:\s*|
685 new\s+SWFObject\(
2b88feed
PH
686 )
687 (["\'])
1bf5423e 688 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
6b08cdf6 689 (?:embed|v|p)/.+?)
1f9da904 690 \1''', webpage)
887c6acd 691 if matches:
ed2d6a19 692 return _playlist_from_matches(
3b2f933b 693 matches, lambda m: unescapeHTML(m[1]))
53c1d3ef 694
355e4fd0
PH
695 # Look for embedded Dailymotion player
696 matches = re.findall(
ef4fd848 697 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
355e4fd0 698 if matches:
ed2d6a19
PH
699 return _playlist_from_matches(
700 matches, lambda m: unescapeHTML(m[1]))
355e4fd0 701
8489578d
NJ
702 # Look for embedded Dailymotion playlist player (#3822)
703 m = re.search(
704 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
705 if m:
706 playlists = re.findall(
707 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
708 if playlists:
709 return _playlist_from_matches(
710 playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
711
ef4fd848
PH
712 # Look for embedded Wistia player
713 match = re.search(
281d3f1d 714 r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
ef4fd848 715 if match:
9471c444
NJ
716 embed_url = self._proto_relative_url(
717 unescapeHTML(match.group('url')))
ef4fd848
PH
718 return {
719 '_type': 'url_transparent',
9471c444 720 'url': embed_url,
ef4fd848
PH
721 'ie_key': 'Wistia',
722 'uploader': video_uploader,
723 'title': video_title,
724 'id': video_id,
725 }
52cffcb1 726
9471c444 727 match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
746c67d7
NJ
728 if match:
729 return {
730 '_type': 'url_transparent',
731 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
732 'ie_key': 'Wistia',
733 'uploader': video_uploader,
734 'title': video_title,
735 'id': match.group('id')
736 }
ef4fd848 737
ee3e63e4 738 # Look for embedded blip.tv player
19dab5e6 739 mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
ee3e63e4 740 if mobj:
19dab5e6 741 return self.url_result('http://blip.tv/a/a-'+mobj.group(1), 'BlipTV')
1f8b6af7 742 mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
ee3e63e4 743 if mobj:
19dab5e6 744 return self.url_result(mobj.group(1), 'BlipTV')
ee3e63e4 745
fa35cdad
PH
746 # Look for embedded condenast player
747 matches = re.findall(
748 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
749 webpage)
750 if matches:
751 return {
752 '_type': 'playlist',
753 'entries': [{
754 '_type': 'url',
755 'ie_key': 'CondeNast',
756 'url': ma,
757 } for ma in matches],
758 'title': video_title,
759 'id': video_id,
760 }
761
c19f7764
JMF
762 # Look for Bandcamp pages with custom domain
763 mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
764 if mobj is not None:
765 burl = unescapeHTML(mobj.group(1))
09804265
JMF
766 # Don't set the extractor because it can be a track url or an album
767 return self.url_result(burl)
c19f7764 768
f25571ff
PH
769 # Look for embedded Vevo player
770 mobj = re.search(
771 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
772 if mobj is not None:
773 return self.url_result(mobj.group('url'))
774
c0d0b01f 775 # Look for Ooyala videos
750f9020
JMF
776 mobj = (re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
777 re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage))
c0d0b01f 778 if mobj is not None:
750f9020 779 return OoyalaIE._build_url_result(mobj.group('ec'))
c0d0b01f 780
aa94a6d3 781 # Look for Aparat videos
48099643 782 mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
aa94a6d3
PH
783 if mobj is not None:
784 return self.url_result(mobj.group(1), 'Aparat')
785
c93c2ab1 786 # Look for MPORA videos
c3f51436 787 mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
c93c2ab1
PH
788 if mobj is not None:
789 return self.url_result(mobj.group(1), 'Mpora')
5f59ee79 790
15c0e8e7 791 # Look for embedded NovaMov-based player
8f89e687 792 mobj = re.search(
8dfa187b 793 r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
15c0e8e7
S
794 (?P<url>http://(?:(?:embed|www)\.)?
795 (?:novamov\.com|
796 nowvideo\.(?:ch|sx|eu|at|ag|co)|
797 videoweed\.(?:es|com)|
798 movshare\.(?:net|sx|ag)|
799 divxstage\.(?:eu|net|ch|co|at|ag))
800 /embed\.php.+?)\1''', webpage)
8f89e687 801 if mobj is not None:
15c0e8e7 802 return self.url_result(mobj.group('url'))
50f56607 803
9834872b
PH
804 # Look for embedded Facebook player
805 mobj = re.search(
db1f3888 806 r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
9834872b
PH
807 if mobj is not None:
808 return self.url_result(mobj.group('url'), 'Facebook')
809
ca97a56e
S
810 # Look for embedded VK player
811 mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
812 if mobj is not None:
813 return self.url_result(mobj.group('url'), 'VK')
814
0364fa8b
S
815 # Look for embedded ivi player
816 mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
817 if mobj is not None:
818 return self.url_result(mobj.group('url'), 'Ivi')
819
db1f3888
PH
820 # Look for embedded Huffington Post player
821 mobj = re.search(
c3f51436 822 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
db1f3888
PH
823 if mobj is not None:
824 return self.url_result(mobj.group('url'), 'HuffPost')
825
1b86cc41 826 # Look for embed.ly
827 mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
828 if mobj is not None:
829 return self.url_result(mobj.group('url'))
830 mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
831 if mobj is not None:
832 return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
833
60cc4dc4
PH
834 # Look for funnyordie embed
835 matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
836 if matches:
ed2d6a19
PH
837 return _playlist_from_matches(
838 matches, getter=unescapeHTML, ie='FunnyOrDie')
60cc4dc4 839
93d020dd
S
840 # Look for embedded RUTV player
841 rutv_url = RUTVIE._extract_url(webpage)
842 if rutv_url:
843 return self.url_result(rutv_url, 'RUTV')
844
7e2ede98
JMF
845 # Look for embedded TED player
846 mobj = re.search(
847 r'<iframe[^>]+?src=(["\'])(?P<url>http://embed\.ted\.com/.+?)\1', webpage)
848 if mobj is not None:
849 return self.url_result(mobj.group('url'), 'TED')
850
5c386252 851 # Look for embedded Ustream videos
852 mobj = re.search(
853 r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
854 if mobj is not None:
855 return self.url_result(mobj.group('url'), 'Ustream')
856
893f8832
PH
857 # Look for embedded arte.tv player
858 mobj = re.search(
859 r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
860 webpage)
861 if mobj is not None:
862 return self.url_result(mobj.group('url'), 'ArteTVEmbed')
863
cb3ac1c6
S
864 # Look for embedded smotri.com player
865 smotri_url = SmotriIE._extract_url(webpage)
866 if smotri_url:
867 return self.url_result(smotri_url, 'Smotri')
868
20991253
PH
869 # Look for embedded soundcloud player
870 mobj = re.search(
ac645ac7 871 r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
20991253
PH
872 webpage)
873 if mobj is not None:
874 url = unescapeHTML(mobj.group('url'))
875 return self.url_result(url)
876
826ec77f
PH
877 # Look for embedded vulture.com player
878 mobj = re.search(
879 r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
880 webpage)
881 if mobj is not None:
882 url = unescapeHTML(mobj.group('url'))
883 return self.url_result(url, ie='Vulture')
884
c5cd249e
JMF
885 # Look for embedded mtvservices player
886 mobj = re.search(
887 r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
888 webpage)
889 if mobj is not None:
890 url = unescapeHTML(mobj.group('url'))
891 return self.url_result(url, ie='MTVServicesEmbedded')
892
49807b4a
S
893 # Look for embedded yahoo player
894 mobj = re.search(
895 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
896 webpage)
897 if mobj is not None:
898 return self.url_result(mobj.group('url'), 'Yahoo')
899
2ef6fcb5
PH
900 # Look for embedded sbs.com.au player
901 mobj = re.search(
902 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)sbs\.com\.au/ondemand/video/single/.+?)\1',
903 webpage)
904 if mobj is not None:
905 return self.url_result(mobj.group('url'), 'SBS')
906
1a94ff68 907 mobj = re.search(
5263cdfc 908 r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1a94ff68
S
909 webpage)
910 if mobj is not None:
911 return self.url_result(mobj.group('url'), 'MLB')
912
1419fafd
S
913 mobj = re.search(
914 r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
915 webpage)
916 if mobj is not None:
917 return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
918
ced659bb
S
def check_video(vurl):
    """Tell whether *vurl* may point at a media file.

    The path component of the URL must contain a dot, and the extension
    reported by determine_ext() for that path must not be one of the
    excluded (non-video) extensions listed below.
    """
    # Flash objects, images and subtitle files are never the video itself.
    excluded_exts = ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
    path = compat_urlparse.urlparse(vurl).path
    ext = determine_ext(path)
    if '.' not in path:
        return False
    return ext not in excluded_exts
923
def filter_video(urls):
    """Return a list of the entries of *urls* accepted by check_video(), in order."""
    return [candidate for candidate in urls if check_video(candidate)]
926
9b122384 927 # Start with something easy: JW Player in SWFObject
ced659bb 928 found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
b30b8698 929 if not found:
d981cef6 930 # Look for gorilla-vid style embedding
ced659bb 931 found = filter_video(re.findall(r'''(?sx)
c0292e8a
PH
932 (?:
933 jw_plugins|
934 JWPlayerOptions|
935 jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
936 )
ced659bb 937 .*?file\s*:\s*["\'](.*?)["\']''', webpage))
b30b8698 938 if not found:
9b122384 939 # Broaden the search a little bit
ced659bb 940 found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
b30b8698
PH
941 if not found:
942 # Broaden the findall a little bit: JWPlayer JS loader
ced659bb
S
943 found = filter_video(re.findall(
944 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
4d805e06
PH
945 if not found:
946 # Flow player
ced659bb 947 found = filter_video(re.findall(r'''(?xs)
4d805e06
PH
948 flowplayer\("[^"]+",\s*
949 \{[^}]+?\}\s*,
950 \s*{[^}]+? ["']?clip["']?\s*:\s*\{\s*
951 ["']?url["']?\s*:\s*["']([^"']+)["']
ced659bb 952 ''', webpage))
b30b8698 953 if not found:
9b122384 954 # Try to find twitter cards info
ced659bb
S
955 found = filter_video(re.findall(
956 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
b30b8698 957 if not found:
9b122384
PH
958 # We look for Open Graph info:
959 # We have to match any number of spaces between elements; some sites try to align them (e.g. statigr.am)
b30b8698 960 m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
9b122384
PH
961 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
962 if m_video_type is not None:
ced659bb 963 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
b30b8698 964 if not found:
7fea7156 965 # HTML5 video
48a24ab7 966 found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src="([^"]+)"', webpage)
b30b8698 967 if not found:
a5a45015 968 found = re.search(
89ef304b 969 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
a04aa7a9 970 r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'?([^\'"]+)',
89ef304b 971 webpage)
b30b8698
PH
972 if found:
973 new_url = found.group(1)
89ef304b
PH
974 self.report_following_redirect(new_url)
975 return {
976 '_type': 'url',
977 'url': new_url,
978 }
b30b8698 979 if not found:
79649588 980 raise ExtractorError('Unsupported URL: %s' % url)
9b122384 981
b30b8698
PH
982 entries = []
983 for video_url in found:
984 video_url = compat_urlparse.urljoin(url, video_url)
985 video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
9b122384 986
b30b8698
PH
987 # Sometimes, jwplayer extraction will result in a YouTube URL
988 if YoutubeIE.suitable(video_url):
989 entries.append(self.url_result(video_url, 'Youtube'))
990 continue
9b122384 991
b30b8698
PH
992 # here's a fun little line of code for you:
993 video_id = os.path.splitext(video_id)[0]
fc9713a1 994
b30b8698
PH
995 entries.append({
996 'id': video_id,
997 'url': video_url,
998 'uploader': video_uploader,
999 'title': video_title,
4d805e06 1000 'age_limit': age_limit,
b30b8698
PH
1001 })
1002
1003 if len(entries) == 1:
669f0e7c 1004 return entries[0]
b30b8698
PH
1005 else:
1006 for num, e in enumerate(entries, start=1):
1007 e['title'] = '%s (%d)' % (e['title'], num)
1008 return {
1009 '_type': 'playlist',
1010 'entries': entries,
1011 }
9b122384 1012