]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/generic.py
[generic] Add support for sportbox embeds
[yt-dlp.git] / youtube_dl / extractor / generic.py
CommitLineData
cfe50f04
JMF
1# encoding: utf-8
2
79649588
PH
3from __future__ import unicode_literals
4
9b122384
PH
5import os
6import re
7
8from .common import InfoExtractor
fc9713a1 9from .youtube import YoutubeIE
8c25f81b 10from ..compat import (
9b122384 11 compat_urllib_parse,
a5caba1e 12 compat_urlparse,
f7300c5c 13 compat_xml_parse_error,
8c25f81b
PH
14)
15from ..utils import (
b759a0d4 16 determine_ext,
9b122384 17 ExtractorError,
c8e9a235 18 float_or_none,
aa94a6d3 19 HEADRequest,
61ca9a80 20 is_html,
ed2d6a19 21 orderedSet,
bcf89ce6 22 parse_xml,
9d4660ca
PH
23 smuggle_url,
24 unescapeHTML,
42393ce2 25 unified_strdate,
4d54ef20 26 unsmuggle_url,
416c7fcb 27 UnsupportedError,
42393ce2 28 url_basename,
76c73715 29 xpath_text,
9b122384 30)
cfe50f04 31from .brightcove import BrightcoveIE
a2edf2e7 32from .nbc import NBCSportsVPlayerIE
c0d0b01f 33from .ooyala import OoyalaIE
93d020dd 34from .rutv import RUTVIE
d40a3b5b 35from .sportbox import SportBoxEmbedIE
cb3ac1c6 36from .smotri import SmotriIE
1419fafd 37from .condenast import CondeNastIE
418c5cc3 38from .udn import UDNEmbedIE
2fe1b5bd 39from .senateisvp import SenateISVPIE
0954cd8a 40from .bliptv import BlipTVIE
bab19a8e 41from .svt import SVTIE
9b122384 42
0838239e 43
9b122384 44class GenericIE(InfoExtractor):
79649588 45 IE_DESC = 'Generic downloader that works on some sites'
9b122384 46 _VALID_URL = r'.*'
79649588 47 IE_NAME = 'generic'
cfe50f04
JMF
48 _TESTS = [
49 {
79649588 50 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
d360a146 51 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
79649588 52 'info_dict': {
d360a146
S
53 'id': '13601338388002',
54 'ext': 'mp4',
79649588
PH
55 'uploader': 'www.hodiho.fr',
56 'title': 'R\u00e9gis plante sa Jeep',
cfe50f04
JMF
57 }
58 },
c19f7764
JMF
59 # bandcamp page with custom domain
60 {
79649588
PH
61 'add_ie': ['Bandcamp'],
62 'url': 'http://bronyrock.com/track/the-pony-mash',
79649588 63 'info_dict': {
fd50bf62
S
64 'id': '3235767654',
65 'ext': 'mp3',
79649588
PH
66 'title': 'The Pony Mash',
67 'uploader': 'M_Pallante',
c19f7764 68 },
79649588 69 'skip': 'There is a limit of 200 free downloads / month for the test song',
c19f7764 70 },
eeb165e6 71 # embedded brightcove video
dd5bcdc4
JMF
72 # it also tests brightcove videos that need to set the 'Referer' in the
73 # http requests
eeb165e6 74 {
79649588
PH
75 'add_ie': ['Brightcove'],
76 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
77 'info_dict': {
78 'id': '2765128793001',
79 'ext': 'mp4',
80 'title': 'Le cours de bourse : l’analyse technique',
81 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
82 'uploader': 'BFM BUSINESS',
eeb165e6 83 },
79649588
PH
84 'params': {
85 'skip_download': True,
eeb165e6
JMF
86 },
87 },
17ab4d3b
PH
88 {
89 # https://github.com/rg3/youtube-dl/issues/2253
90 'url': 'http://bcove.me/i6nfkrc3',
17ab4d3b
PH
91 'md5': '0ba9446db037002366bab3b3eb30c88c',
92 'info_dict': {
fd50bf62
S
93 'id': '3101154703001',
94 'ext': 'mp4',
17ab4d3b
PH
95 'title': 'Still no power',
96 'uploader': 'thestar.com',
97 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
98 },
99 'add_ie': ['Brightcove'],
100 },
0479c625
S
101 {
102 'url': 'http://www.championat.com/video/football/v/87/87499.html',
103 'md5': 'fb973ecf6e4a78a67453647444222983',
104 'info_dict': {
105 'id': '3414141473001',
106 'ext': 'mp4',
107 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
108 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
109 'uploader': 'Championat',
110 },
111 },
bdf97017 112 {
37aab278 113 # https://github.com/rg3/youtube-dl/issues/3541
bdf97017
NJ
114 'add_ie': ['Brightcove'],
115 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
116 'info_dict': {
117 'id': '3866516442001',
37aab278 118 'ext': 'mp4',
bdf97017
NJ
119 'title': 'Leer mij vrouwen kennen: Aflevering 1',
120 'description': 'Leer mij vrouwen kennen: Aflevering 1',
121 'uploader': 'SBS Broadcasting',
122 },
37aab278 123 'skip': 'Restricted to Netherlands',
bdf97017 124 'params': {
37aab278 125 'skip_download': True, # m3u8 download
bdf97017
NJ
126 },
127 },
42393ce2
PH
128 # Direct link to a video
129 {
79649588 130 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
79649588
PH
131 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
132 'info_dict': {
133 'id': 'trailer',
89ef304b 134 'ext': 'mp4',
79649588
PH
135 'title': 'trailer',
136 'upload_date': '20100513',
42393ce2 137 }
c0d0b01f
JMF
138 },
139 # ooyala video
140 {
79649588 141 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
87830900 142 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
79649588
PH
143 'info_dict': {
144 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
145 'ext': 'mp4',
3486df38 146 'title': '2cc213299525360.mov', # that's what we get
c0d0b01f 147 },
87830900 148 'add_ie': ['Ooyala'],
c0d0b01f 149 },
f076b638 150 # multiple ooyala embeds on SBN network websites
151 {
152 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
153 'info_dict': {
154 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
155 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
156 },
157 'playlist_mincount': 3,
158 'params': {
159 'skip_download': True,
160 },
161 'add_ie': ['Ooyala'],
162 },
89ef304b
PH
163 # google redirect
164 {
165 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
166 'info_dict': {
167 'id': 'cmQHVoWB5FY',
168 'ext': 'mp4',
169 'upload_date': '20130224',
170 'uploader_id': 'TheVerge',
87830900 171 'description': 're:^Chris Ziegler takes a look at the\.*',
89ef304b
PH
172 'uploader': 'The Verge',
173 'title': 'First Firefox OS phones side-by-side',
174 },
175 'params': {
176 'skip_download': False,
177 }
f55a1f0a 178 },
1b86cc41 179 # embed.ly video
180 {
181 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
182 'info_dict': {
183 'id': '9ODmcdjQcHQ',
184 'ext': 'mp4',
0a5bce56
PH
185 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
186 'upload_date': '20140225',
187 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
188 'uploader': 'Tested',
189 'uploader_id': 'testedcom',
1b86cc41 190 },
191 # No need to test YoutubeIE here
192 'params': {
193 'skip_download': True,
194 },
195 },
60cc4dc4
PH
196 # funnyordie embed
197 {
198 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
60cc4dc4
PH
199 'info_dict': {
200 'id': '18e820ec3f',
201 'ext': 'mp4',
202 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
203 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
93d020dd 204 },
60cc4dc4 205 },
faa4ea68
S
206 # BBC iPlayer embeds
207 {
208 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
209 'info_dict': {
210 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
211 },
212 'playlist_mincount': 18,
213 },
93d020dd
S
214 # RUTV embed
215 {
216 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
217 'info_dict': {
218 'id': '776940',
219 'ext': 'mp4',
220 'title': 'Охотское море стало целиком российским',
221 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
222 },
223 'params': {
224 # m3u8 download
225 'skip_download': True,
226 },
aab74fa1
PH
227 },
228 # Embedded TED video
229 {
230 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
a8eb5a8e 231 'md5': '65fdff94098e4a607385a60c5177c638',
aab74fa1 232 'info_dict': {
a8eb5a8e 233 'id': '1969',
aab74fa1 234 'ext': 'mp4',
a8eb5a8e
PH
235 'title': 'Hidden miracles of the natural world',
236 'uploader': 'Louie Schwartzberg',
237 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
aab74fa1 238 }
60cc4dc4 239 },
5c386252 240 # Embedded Ustream video
241 {
242 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
243 'md5': '27b99cdb639c9b12a79bca876a073417',
244 'info_dict': {
ca6aada4 245 'id': '45734260',
246 'ext': 'flv',
247 'uploader': 'AU SPA: The NSA and Privacy',
5c386252 248 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
249 }
250 },
d95e35d6
S
251 # nowvideo embed hidden behind percent encoding
252 {
253 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
254 'md5': '2baf4ddd70f697d94b1c18cf796d5107',
255 'info_dict': {
256 'id': '06e53103ca9aa',
257 'ext': 'flv',
258 'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
259 'description': 'No description',
260 },
0f2a2ba1 261 },
893f8832
PH
262 # arte embed
263 {
264 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
265 'md5': '7653032cbb25bf6c80d80f217055fa43',
266 'info_dict': {
267 'id': '048195-004_PLUS7-F',
268 'ext': 'flv',
269 'title': 'X:enius',
270 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
271 'upload_date': '20140320',
272 },
273 'params': {
274 'skip_download': 'Requires rtmpdump'
275 }
276 },
fa35cdad
PH
277 # Condé Nast embed
278 {
279 'url': 'http://www.wired.com/2014/04/honda-asimo/',
280 'md5': 'ba0dfe966fa007657bd1443ee672db0f',
281 'info_dict': {
282 'id': '53501be369702d3275860000',
283 'ext': 'mp4',
284 'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
285 }
ebd3c7b3
PH
286 },
287 # Dailymotion embed
288 {
289 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
290 'md5': '441aeeb82eb72c422c7f14ec533999cd',
291 'info_dict': {
292 'id': 'k2mm4bCdJ6CQ2i7c8o2',
293 'ext': 'mp4',
294 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
295 'uploader': 'Spi0n',
296 },
297 'add_ie': ['Dailymotion'],
2b88feed
PH
298 },
299 # YouTube embed
300 {
301 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
302 'info_dict': {
303 'id': 'FXRb4ykk4S0',
304 'ext': 'mp4',
305 'title': 'The NBL Auction 2014',
306 'uploader': 'BADMINTON England',
307 'uploader_id': 'BADMINTONEvents',
308 'upload_date': '20140603',
309 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
310 },
311 'add_ie': ['Youtube'],
312 'params': {
313 'skip_download': True,
314 }
315 },
c5cd249e
JMF
316 # MTVServices embed
317 {
318 'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
319 'md5': '35727f82f58c76d996fc188f9755b0d5',
320 'info_dict': {
321 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
322 'ext': 'mp4',
323 'title': 'Review',
324 'description': 'Mario\'s life in the fast lane has never looked so good.',
325 },
326 },
61013473 327 # YouTube embed via <data-embed-url="">
328 {
329 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
61013473 330 'info_dict': {
a8eb5a8e 331 'id': '4vAffPZIT44',
61013473 332 'ext': 'mp4',
a8eb5a8e 333 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
ed2d6a19
PH
334 'uploader': 'Gameloft',
335 'uploader_id': 'gameloft',
a8eb5a8e
PH
336 'upload_date': '20140828',
337 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
ed2d6a19
PH
338 },
339 'params': {
340 'skip_download': True,
61013473 341 }
c8e9a235
PH
342 },
343 # Camtasia studio
344 {
345 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
346 'playlist': [{
347 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
348 'info_dict': {
349 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
350 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
351 'ext': 'flv',
352 'duration': 2235.90,
353 }
354 }, {
355 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
356 'info_dict': {
357 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
358 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
359 'ext': 'flv',
360 'duration': 2235.93,
361 }
362 }],
363 'info_dict': {
364 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
365 }
4d805e06
PH
366 },
367 # Flowplayer
368 {
369 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
370 'md5': '9d65602bf31c6e20014319c7d07fba27',
371 'info_dict': {
372 'id': '5123ea6d5e5a7',
373 'ext': 'mp4',
374 'age_limit': 18,
375 'uploader': 'www.handjobhub.com',
d6d9186f 376 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
4d805e06 377 }
0990305d
PH
378 },
379 # RSS feed
380 {
381 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
382 'info_dict': {
383 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
384 'title': 'Zero Punctuation',
b1b0b1ca 385 'description': 're:.*groundbreaking video review series.*'
0990305d
PH
386 },
387 'playlist_mincount': 11,
22a6f150
PH
388 },
389 # Multiple brightcove videos
390 # https://github.com/rg3/youtube-dl/issues/2283
391 {
392 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
393 'info_dict': {
394 'id': 'always-never',
395 'title': 'Always / Never - The New Yorker',
396 },
397 'playlist_count': 3,
398 'params': {
399 'extract_flat': False,
400 'skip_download': True,
401 }
1a94ff68
S
402 },
403 # MLB embed
404 {
405 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
406 'md5': '96f09a37e44da40dd083e12d9a683327',
407 'info_dict': {
408 'id': '33322633',
409 'ext': 'mp4',
410 'title': 'Ump changes call to ball',
411 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
412 'duration': 48,
413 'timestamp': 1401537900,
414 'upload_date': '20140531',
415 'thumbnail': 're:^https?://.*\.jpg$',
416 },
417 },
746c67d7
NJ
418 # Wistia embed
419 {
420 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
421 'md5': '8788b683c777a5cf25621eaf286d0c23',
422 'info_dict': {
423 'id': '1cfaf6b7ea',
424 'ext': 'mov',
425 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
426 'duration': 643.0,
427 'filesize': 182808282,
428 'uploader': 'education-portal.com',
429 },
430 },
52cffcb1 431 {
432 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
433 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
434 'info_dict': {
435 'id': 'uxjb0lwrcz',
436 'ext': 'mp4',
85d7b765 437 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
52cffcb1 438 'duration': 1715.0,
85d7b765 439 'uploader': 'thoughtworks.wistia.com',
70b7e3fb 440 },
52cffcb1 441 },
70b7e3fb
PH
442 # Direct download with broken HEAD
443 {
444 'url': 'http://ai-radio.org:8000/radio.opus',
445 'info_dict': {
446 'id': 'radio',
447 'ext': 'opus',
448 'title': 'radio',
449 },
450 'params': {
451 'skip_download': True, # infinite live stream
452 },
453 'expected_warnings': [
454 r'501.*Not Implemented'
455 ],
ac645ac7
PH
456 },
457 # Soundcloud embed
458 {
459 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
460 'info_dict': {
461 'id': '174391317',
462 'ext': 'mp3',
463 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
464 'uploader': 'Sophos Security',
465 'title': 'Chet Chat 171 - Oct 29, 2014',
466 'upload_date': '20141029',
467 }
af63fed7
PH
468 },
469 # Livestream embed
470 {
471 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
472 'info_dict': {
473 'id': '67864563',
474 'ext': 'flv',
475 'upload_date': '20141112',
476 'title': 'Rosetta #CometLanding webcast HL 10',
477 }
478 },
65f3a228
PH
479 # LazyYT
480 {
481 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
482 'info_dict': {
11e611a7 483 'id': '1986',
65f3a228
PH
484 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
485 },
486 'playlist_mincount': 2,
4e262a88
PH
487 },
488 # Direct link with incorrect MIME type
489 {
490 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
491 'md5': '4ccbebe5f36706d85221f204d7eb5913',
492 'info_dict': {
493 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
494 'id': '5_Lennart_Poettering_-_Systemd',
495 'ext': 'webm',
496 'title': '5_Lennart_Poettering_-_Systemd',
497 'upload_date': '20141120',
498 },
499 'expected_warnings': [
500 'URL could be a direct video link, returning it as such.'
501 ]
42bdd9d0
PH
502 },
503 # Cinchcast embed
504 {
505 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
506 'info_dict': {
507 'id': '7141703',
508 'ext': 'mp3',
509 'upload_date': '20141126',
510 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
511 }
512 },
501f13fb
PH
513 # Cinerama player
514 {
515 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
516 'info_dict': {
517 'id': '730m_DandD_1901_512k',
518 'ext': 'mp4',
519 'uploader': 'www.abc.net.au',
520 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
521 }
796df3c6
S
522 },
523 # embedded viddler video
524 {
525 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
526 'info_dict': {
527 'id': '4d03aad9',
528 'ext': 'mp4',
529 'uploader': 'deadspin',
530 'title': 'WALL-TO-GORTAT',
531 'timestamp': 1422285291,
532 'upload_date': '20150126',
533 },
534 'add_ie': ['Viddler'],
a0f71985 535 },
2051acde
S
536 # Libsyn embed
537 {
538 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
539 'info_dict': {
540 'id': '3377616',
541 'ext': 'mp3',
542 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
543 'description': 'md5:601cb790edd05908957dae8aaa866465',
544 'upload_date': '20150220',
545 },
546 },
a0f71985
PH
547 # jwplayer YouTube
548 {
549 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
550 'info_dict': {
551 'id': 'Mrj4DVp2zeA',
552 'ext': 'mp4',
f37e3f99 553 'upload_date': '20150212',
a0f71985
PH
554 'uploader': 'The National Archives UK',
555 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
556 'uploader_id': 'NationalArchives08',
557 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
558 },
59b8ab58
PH
559 },
560 # rtl.nl embed
561 {
562 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
563 'playlist_mincount': 5,
564 'info_dict': {
565 'id': 'aanslagen-kopenhagen',
566 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
567 }
255fca5e
S
568 },
569 # Zapiks embed
570 {
571 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
572 'info_dict': {
573 'id': '118046',
574 'ext': 'mp4',
575 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
576 }
577 },
e3216b82
NJ
578 # Kaltura embed
579 {
580 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
581 'info_dict': {
582 'id': '1_eergr3h1',
583 'ext': 'mp4',
584 'upload_date': '20150226',
585 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
586 'timestamp': int,
587 'title': 'John Carlson Postgame 2/25/15',
588 },
589 },
135c9c42
S
590 # Eagle.Platform embed (generic URL)
591 {
592 'url': 'http://lenta.ru/news/2015/03/06/navalny/',
593 'info_dict': {
594 'id': '227304',
595 'ext': 'mp4',
596 'title': 'Навальный вышел на свободу',
597 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
598 'thumbnail': 're:^https?://.*\.jpg$',
599 'duration': 87,
600 'view_count': int,
601 'age_limit': 0,
602 },
603 },
d47ae7f6
S
604 # ClipYou (Eagle.Platform) embed (custom URL)
605 {
606 'url': 'http://muz-tv.ru/play/7129/',
607 'info_dict': {
608 'id': '12820',
609 'ext': 'mp4',
610 'title': "'O Sole Mio",
611 'thumbnail': 're:^https?://.*\.jpg$',
612 'duration': 216,
613 'view_count': int,
614 },
615 },
f8388757
S
616 # Pladform embed
617 {
618 'url': 'http://muz-tv.ru/kinozal/view/7400/',
619 'info_dict': {
620 'id': '100183293',
621 'ext': 'mp4',
62259846 622 'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
f8388757
S
623 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
624 'thumbnail': 're:^https?://.*\.jpg$',
625 'duration': 694,
626 'age_limit': 0,
627 },
628 },
c798f15b
S
629 # Playwire embed
630 {
631 'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
632 'info_dict': {
633 'id': '3519514',
634 'ext': 'mp4',
635 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
636 'thumbnail': 're:^https?://.*\.png$',
637 'duration': 45.115,
638 },
639 },
ad320e9b
NJ
640 # 5min embed
641 {
642 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
643 'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
644 'info_dict': {
645 'id': '518726732',
646 'ext': 'mp4',
647 'title': 'Facebook Creates "On This Day" | Crunch Report',
648 },
649 },
dc455a5f
S
650 # SVT embed
651 {
652 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
653 'info_dict': {
654 'id': '2900353',
655 'ext': 'flv',
656 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
657 'duration': 27,
658 'age_limit': 0,
659 },
660 },
76c73715
PH
661 # RSS feed with enclosure
662 {
663 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
664 'info_dict': {
665 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
666 'ext': 'm4v',
667 'upload_date': '20150228',
668 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
669 }
a2edf2e7 670 },
a4257017
S
671 # Crooks and Liars embed
672 {
673 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
674 'info_dict': {
675 'id': '8RUoRhRi',
676 'ext': 'mp4',
677 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
678 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
679 'timestamp': 1428207000,
680 'upload_date': '20150405',
681 'uploader': 'Heather',
682 },
683 },
684 # Crooks and Liars external embed
685 {
686 'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
687 'info_dict': {
688 'id': 'MTE3MjUtMzQ2MzA',
689 'ext': 'mp4',
690 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
691 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
692 'timestamp': 1265032391,
693 'upload_date': '20100201',
694 'uploader': 'Heather',
695 },
696 },
facecb84 697 # NBC Sports vplayer embed
a2edf2e7 698 {
facecb84 699 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
a2edf2e7 700 'info_dict': {
facecb84
S
701 'id': 'ln7x1qSThw4k',
702 'ext': 'flv',
703 'title': "PFT Live: New leader in the 'new-look' defense",
704 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
a2edf2e7 705 },
418c5cc3
YCH
706 },
707 # UDN embed
708 {
709 'url': 'http://www.udn.com/news/story/7314/822787',
01c58f84 710 'md5': 'fd2060e988c326991037b9aff9df21a6',
418c5cc3 711 'info_dict': {
01c58f84 712 'id': '300346',
418c5cc3 713 'ext': 'mp4',
01c58f84 714 'title': '中一中男師變性 全校師生力挺',
418c5cc3
YCH
715 'thumbnail': 're:^https?://.*\.jpg$',
716 }
edfcf7ab
YCH
717 },
718 # Ooyala embed
719 {
720 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
721 'info_dict': {
722 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
723 'ext': 'mp4',
724 'description': 'VIDEO: Index/Match versus VLOOKUP.',
725 'title': 'This is what separates the Excel masters from the wannabes',
726 },
727 'params': {
728 # m3u8 downloads
729 'skip_download': True,
730 }
d6fd958c
YCH
731 },
732 # Contains a SMIL manifest
733 {
734 'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
735 'info_dict': {
736 'id': 'file',
737 'ext': 'flv',
738 'title': '+ Football: Lottery Champions League Europe',
739 'uploader': 'www.telewebion.com',
740 },
741 'params': {
742 # rtmpe downloads
743 'skip_download': True,
744 }
76c73715 745 }
cfe50f04 746 ]
9b122384 747
9b122384
PH
def report_following_redirect(self, new_url):
    """Announce on the downloader's screen output that new_url is being followed."""
    notice = '[redirect] Following redirect to %s' % new_url
    self._downloader.to_screen(notice)
9b122384 751
4fc946b5
PH
752 def _extract_rss(self, url, video_id, doc):
753 playlist_title = doc.find('./channel/title').text
754 playlist_desc_el = doc.find('./channel/description')
755 playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
756
76c73715
PH
757 entries = []
758 for it in doc.findall('./channel/item'):
759 next_url = xpath_text(it, 'link', fatal=False)
760 if not next_url:
761 enclosure_nodes = it.findall('./enclosure')
762 for e in enclosure_nodes:
763 next_url = e.attrib.get('url')
764 if next_url:
765 break
766
767 if not next_url:
768 continue
769
770 entries.append({
771 '_type': 'url',
772 'url': next_url,
773 'title': it.find('title').text,
774 })
4fc946b5
PH
775
776 return {
777 '_type': 'playlist',
778 'id': url,
779 'title': playlist_title,
780 'description': playlist_desc,
781 'entries': entries,
782 }
783
c8e9a235
PH
def _extract_camtasia(self, url, video_id, webpage):
    """Extract a Camtasia playlist from webpage.

    Searches webpage for an fo.addVariable("csConfigFile", "...") call,
    downloads the referenced XML configuration (resolved relative to url)
    and turns every child of ./playlist/array/fileset that has a <uri>
    into a playlist entry.

    Returns a playlist dict with 'entries' and 'title', or None if no
    camtasia video can be found (no configuration reference in the page,
    or no fileset in the configuration).
    """

    camtasia_cfg = self._search_regex(
        r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
        webpage, 'camtasia configuration file', default=None)
    if camtasia_cfg is None:
        return None

    # Looked up with fatal=True, i.e. before any download is attempted.
    title = self._html_search_meta('DC.title', webpage, fatal=True)

    camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
    camtasia_cfg = self._download_xml(
        camtasia_url, video_id,
        note='Downloading camtasia configuration',
        errnote='Failed to download camtasia configuration')
    fileset_node = camtasia_cfg.find('./playlist/array/fileset')
    if fileset_node is None:
        # A configuration without a fileset describes no video.
        return None

    entries = []
    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9, while plain iteration works on every version.
    for n in fileset_node:
        url_n = n.find('./uri')
        if url_n is None:
            continue

        duration_n = n.find('./duration')
        entries.append({
            # File name of the URI without directory and extension.
            'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
            'title': '%s - %s' % (title, n.tag),
            'url': compat_urlparse.urljoin(url, url_n.text),
            'duration': (
                None if duration_n is None
                else float_or_none(duration_n.text)),
        })

    return {
        '_type': 'playlist',
        'entries': entries,
        'title': title,
    }
820
9b122384 821 def _real_extract(self, url):
ebd3c7b3
PH
822 if url.startswith('//'):
823 return {
824 '_type': 'url',
20991253 825 'url': self.http_scheme() + url,
ebd3c7b3
PH
826 }
827
a7130543
JMF
828 parsed_url = compat_urlparse.urlparse(url)
829 if not parsed_url.scheme:
04b4d394
PH
830 default_search = self._downloader.params.get('default_search')
831 if default_search is None:
1f7ccb90 832 default_search = 'fixup_error'
04b4d394 833
1f7ccb90 834 if default_search in ('auto', 'auto_warning', 'fixup_error'):
04b4d394
PH
835 if '/' in url:
836 self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
837 return self.url_result('http://' + url)
1f7ccb90 838 elif default_search != 'fixup_error':
9c1fc022 839 if default_search == 'auto_warning':
0e67ab0d
PH
840 if re.match(r'^(?:url|URL)$', url):
841 raise ExtractorError(
842 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
843 expected=True)
844 else:
845 self._downloader.report_warning(
7571c02c 846 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
04b4d394 847 return self.url_result('ytsearch:' + url)
1f7ccb90
PH
848
849 if default_search in ('error', 'fixup_error'):
7571c02c 850 raise ExtractorError(
b74e86f4
PH
851 '%r is not a valid URL. '
852 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
853 % (url, url), expected=True)
04b4d394 854 else:
f2f2c0c2
PH
855 if ':' not in default_search:
856 default_search += ':'
04b4d394 857 return self.url_result(default_search + url)
4d54ef20
PH
858
859 url, smuggled_data = unsmuggle_url(url)
860 force_videoid = None
d6e6a422 861 is_intentional = smuggled_data and smuggled_data.get('to_generic')
4d54ef20
PH
862 if smuggled_data and 'force_videoid' in smuggled_data:
863 force_videoid = smuggled_data['force_videoid']
864 video_id = force_videoid
865 else:
866 video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
a7130543 867
79649588 868 self.to_screen('%s: Requesting header' % video_id)
c1d1facd 869
ebab4520 870 head_req = HEADRequest(url)
23be51d8 871 head_response = self._request_webpage(
ebab4520
PH
872 head_req, video_id,
873 note=False, errnote='Could not send HEAD request to %s' % url,
874 fatal=False)
42393ce2 875
23be51d8 876 if head_response is not False:
42393ce2 877 # Check for redirect
23be51d8 878 new_url = head_response.geturl()
42393ce2
PH
879 if url != new_url:
880 self.report_following_redirect(new_url)
4d54ef20
PH
881 if force_videoid:
882 new_url = smuggle_url(
883 new_url, {'force_videoid': force_videoid})
cecaaf3f 884 return self.url_result(new_url)
42393ce2 885
23be51d8
PH
886 full_response = None
887 if head_response is False:
888 full_response = self._request_webpage(url, video_id)
889 head_response = full_response
890
891 # Check for direct link to a video
892 content_type = head_response.headers.get('Content-Type', '')
893 m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
894 if m:
895 upload_date = unified_strdate(
896 head_response.headers.get('Last-Modified'))
897 return {
898 'id': video_id,
899 'title': os.path.splitext(url_basename(url))[0],
ccdd0ffb 900 'direct': True,
23be51d8
PH
901 'formats': [{
902 'format_id': m.group('format_id'),
903 'url': url,
904 'vcodec': 'none' if m.group('type') == 'audio' else None
905 }],
906 'upload_date': upload_date,
907 }
42393ce2 908
d6e6a422
PH
909 if not self._downloader.params.get('test', False) and not is_intentional:
910 self._downloader.report_warning('Falling back on generic information extractor.')
911
4e262a88
PH
912 if not full_response:
913 full_response = self._request_webpage(url, video_id)
914
915 # Maybe it's a direct link to a video?
916 # Be careful not to download the whole thing!
917 first_bytes = full_response.read(512)
61ca9a80 918 if not is_html(first_bytes):
4e262a88
PH
919 self._downloader.report_warning(
920 'URL could be a direct video link, returning it as such.')
921 upload_date = unified_strdate(
922 head_response.headers.get('Last-Modified'))
923 return {
924 'id': video_id,
925 'title': os.path.splitext(url_basename(url))[0],
926 'direct': True,
927 'url': url,
928 'upload_date': upload_date,
929 }
930
931 webpage = self._webpage_read_content(
932 full_response, url, video_id, prefix=first_bytes)
933
9b122384 934 self.report_extraction(video_id)
887c6acd 935
4fc946b5
PH
936 # Is it an RSS feed?
937 try:
bcf89ce6 938 doc = parse_xml(webpage)
4fc946b5
PH
939 if doc.tag == 'rss':
940 return self._extract_rss(url, video_id, doc)
f7300c5c 941 except compat_xml_parse_error:
4fc946b5
PH
942 pass
943
c8e9a235
PH
944 # Is it a Camtasia project?
945 camtasia_res = self._extract_camtasia(url, video_id, webpage)
946 if camtasia_res is not None:
947 return camtasia_res
948
14390730
S
949 # Sometimes embedded video player is hidden behind percent encoding
950 # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
951 # Unescaping the whole page allows to handle those cases in a generic way
1f7659db
S
952 webpage = compat_urllib_parse.unquote(webpage)
953
887c6acd
PH
954 # it's tempting to parse this further, but you would
955 # have to take into account all the variations like
956 # Video Title - Site Name
957 # Site Name | Video Title
958 # Video Title - Tagline | Site Name
959 # and so on and so forth; it's just not practical
ef4fd848 960 video_title = self._html_search_regex(
79649588
PH
961 r'(?s)<title>(.*?)</title>', webpage, 'video title',
962 default='video')
ef4fd848 963
4d805e06
PH
964 # Try to detect age limit automatically
965 age_limit = self._rta_search(webpage)
966 # And then there are the jokers who advertise that they use RTA,
967 # but actually don't.
968 AGE_LIMIT_MARKERS = [
969 r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
970 ]
971 if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
972 age_limit = 18
973
ef4fd848
PH
974 # video uploader is domain name
975 video_uploader = self._search_regex(
79649588 976 r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
887c6acd 977
ed2d6a19 978 # Helper method
def _playlist_from_matches(matches, getter=None, ie=None):
    """Turn a sequence of regex matches into a playlist result.

    Each element of ``matches`` is mapped through ``getter`` when one is
    supplied (otherwise used as-is), passed through
    ``self._proto_relative_url`` and wrapped with ``self.url_result`` using
    the extractor key ``ie``.  The resulting entries are run through
    ``orderedSet`` and handed to ``self.playlist_result`` together with the
    ``video_id`` and ``video_title`` of the enclosing scope.
    """
    def _iter_entries():
        for match in matches:
            target = getter(match) if getter else match
            yield self.url_result(self._proto_relative_url(target), ie)

    unique_entries = orderedSet(_iter_entries())
    return self.playlist_result(
        unique_entries, playlist_id=video_id, playlist_title=video_title)
985
627a91a9 986 # Look for BrightCove:
99877772
PH
987 bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
988 if bc_urls:
79649588 989 self.to_screen('Brightcove video detected.')
99877772
PH
990 entries = [{
991 '_type': 'url',
992 'url': smuggle_url(bc_url, {'Referer': url}),
993 'ie_key': 'Brightcove'
994 } for bc_url in bc_urls]
995
996 return {
997 '_type': 'playlist',
998 'title': video_title,
999 'id': video_id,
1000 'entries': entries,
1001 }
cfe50f04 1002
59b8ab58
PH
1003 # Look for embedded rtl.nl player
1004 matches = re.findall(
1005 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
1006 webpage)
1007 if matches:
1008 return _playlist_from_matches(matches, ie='RtlNl')
1009
7115ca84 1010 # Look for embedded (iframe) Vimeo player
9d4660ca 1011 mobj = re.search(
15fd51b3 1012 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
9d4660ca 1013 if mobj:
15fd51b3 1014 player_url = unescapeHTML(mobj.group('url'))
9d4660ca 1015 surl = smuggle_url(player_url, {'Referer': url})
09a42738 1016 return self.url_result(surl)
7115ca84
PH
1017 # Look for embedded (swf embed) Vimeo player
1018 mobj = re.search(
09a42738 1019 r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
7115ca84 1020 if mobj:
09a42738 1021 return self.url_result(mobj.group(1))
7115ca84 1022
53c1d3ef 1023 # Look for embedded YouTube player
1f9da904 1024 matches = re.findall(r'''(?x)
2b88feed
PH
1025 (?:
1026 <iframe[^>]+?src=|
c71dfccc 1027 data-video-url=|
2b88feed 1028 <embed[^>]+?src=|
a7e97f6d
PH
1029 embedSWF\(?:\s*|
1030 new\s+SWFObject\(
2b88feed
PH
1031 )
1032 (["\'])
1bf5423e 1033 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
6b08cdf6 1034 (?:embed|v|p)/.+?)
1f9da904 1035 \1''', webpage)
887c6acd 1036 if matches:
ed2d6a19 1037 return _playlist_from_matches(
3b2f933b 1038 matches, lambda m: unescapeHTML(m[1]))
53c1d3ef 1039
65f3a228
PH
1040 # Look for lazyYT YouTube embed
1041 matches = re.findall(
1042 r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
1043 if matches:
1044 return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
1045
355e4fd0
PH
1046 # Look for embedded Dailymotion player
1047 matches = re.findall(
ef4fd848 1048 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
355e4fd0 1049 if matches:
ed2d6a19
PH
1050 return _playlist_from_matches(
1051 matches, lambda m: unescapeHTML(m[1]))
355e4fd0 1052
8489578d
NJ
1053 # Look for embedded Dailymotion playlist player (#3822)
1054 m = re.search(
1055 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
1056 if m:
1057 playlists = re.findall(
1058 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
1059 if playlists:
1060 return _playlist_from_matches(
1061 playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
1062
ef4fd848
PH
1063 # Look for embedded Wistia player
1064 match = re.search(
281d3f1d 1065 r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
ef4fd848 1066 if match:
9471c444
NJ
1067 embed_url = self._proto_relative_url(
1068 unescapeHTML(match.group('url')))
ef4fd848
PH
1069 return {
1070 '_type': 'url_transparent',
9471c444 1071 'url': embed_url,
ef4fd848
PH
1072 'ie_key': 'Wistia',
1073 'uploader': video_uploader,
1074 'title': video_title,
1075 'id': video_id,
1076 }
5f6a1245 1077
9471c444 1078 match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
746c67d7
NJ
1079 if match:
1080 return {
1081 '_type': 'url_transparent',
1082 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
1083 'ie_key': 'Wistia',
1084 'uploader': video_uploader,
1085 'title': video_title,
1086 'id': match.group('id')
1087 }
ef4fd848 1088
ee3e63e4 1089 # Look for embedded blip.tv player
0954cd8a
YCH
1090 bliptv_url = BlipTVIE._extract_url(webpage)
1091 if bliptv_url:
1092 return self.url_result(bliptv_url, 'BlipTV')
ee3e63e4 1093
bab19a8e
S
1094 # Look for SVT player
1095 svt_url = SVTIE._extract_url(webpage)
1096 if svt_url:
1097 return self.url_result(svt_url, 'SVT')
1098
fa35cdad
PH
1099 # Look for embedded condenast player
1100 matches = re.findall(
1101 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1102 webpage)
1103 if matches:
1104 return {
1105 '_type': 'playlist',
1106 'entries': [{
1107 '_type': 'url',
1108 'ie_key': 'CondeNast',
1109 'url': ma,
1110 } for ma in matches],
1111 'title': video_title,
1112 'id': video_id,
1113 }
1114
c19f7764
JMF
1115 # Look for Bandcamp pages with custom domain
1116 mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1117 if mobj is not None:
1118 burl = unescapeHTML(mobj.group(1))
09804265
JMF
1119 # Don't set the extractor because it can be a track url or an album
1120 return self.url_result(burl)
c19f7764 1121
f25571ff
PH
1122 # Look for embedded Vevo player
1123 mobj = re.search(
1124 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1125 if mobj is not None:
1126 return self.url_result(mobj.group('url'))
796df3c6
S
1127
1128 # Look for embedded Viddler player
cb454b33
S
1129 mobj = re.search(
1130 r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1131 webpage)
796df3c6
S
1132 if mobj is not None:
1133 return self.url_result(mobj.group('url'))
f25571ff 1134
3378d67a
S
1135 # Look for NYTimes player
1136 mobj = re.search(
1137 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1138 webpage)
1139 if mobj is not None:
1140 return self.url_result(mobj.group('url'))
1141
cefdf970
S
1142 # Look for Libsyn player
1143 mobj = re.search(
1144 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1145 if mobj is not None:
1146 return self.url_result(mobj.group('url'))
1147
c0d0b01f 1148 # Look for Ooyala videos
cb454b33 1149 mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
f076b638 1150 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
edfcf7ab
YCH
1151 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
1152 re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
c0d0b01f 1153 if mobj is not None:
750f9020 1154 return OoyalaIE._build_url_result(mobj.group('ec'))
c0d0b01f 1155
f076b638 1156 # Look for multiple Ooyala embeds on SBN network websites
1157 mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1158 if mobj is not None:
1159 embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1160 if embeds:
1161 return _playlist_from_matches(
1162 embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1163
aa94a6d3 1164 # Look for Aparat videos
48099643 1165 mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
aa94a6d3
PH
1166 if mobj is not None:
1167 return self.url_result(mobj.group(1), 'Aparat')
1168
c93c2ab1 1169 # Look for MPORA videos
c3f51436 1170 mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
c93c2ab1
PH
1171 if mobj is not None:
1172 return self.url_result(mobj.group(1), 'Mpora')
5f59ee79 1173
15c0e8e7 1174 # Look for embedded NovaMov-based player
8f89e687 1175 mobj = re.search(
8dfa187b 1176 r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
15c0e8e7
S
1177 (?P<url>http://(?:(?:embed|www)\.)?
1178 (?:novamov\.com|
1179 nowvideo\.(?:ch|sx|eu|at|ag|co)|
1180 videoweed\.(?:es|com)|
1181 movshare\.(?:net|sx|ag)|
1182 divxstage\.(?:eu|net|ch|co|at|ag))
1183 /embed\.php.+?)\1''', webpage)
8f89e687 1184 if mobj is not None:
15c0e8e7 1185 return self.url_result(mobj.group('url'))
50f56607 1186
9834872b
PH
1187 # Look for embedded Facebook player
1188 mobj = re.search(
db1f3888 1189 r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
9834872b
PH
1190 if mobj is not None:
1191 return self.url_result(mobj.group('url'), 'Facebook')
1192
ca97a56e
S
1193 # Look for embedded VK player
1194 mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1195 if mobj is not None:
1196 return self.url_result(mobj.group('url'), 'VK')
1197
0364fa8b
S
1198 # Look for embedded ivi player
1199 mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1200 if mobj is not None:
1201 return self.url_result(mobj.group('url'), 'Ivi')
1202
db1f3888
PH
1203 # Look for embedded Huffington Post player
1204 mobj = re.search(
c3f51436 1205 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
db1f3888
PH
1206 if mobj is not None:
1207 return self.url_result(mobj.group('url'), 'HuffPost')
1208
1b86cc41 1209 # Look for embed.ly
1210 mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1211 if mobj is not None:
1212 return self.url_result(mobj.group('url'))
1213 mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1214 if mobj is not None:
1215 return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1216
60cc4dc4
PH
1217 # Look for funnyordie embed
1218 matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1219 if matches:
ed2d6a19
PH
1220 return _playlist_from_matches(
1221 matches, getter=unescapeHTML, ie='FunnyOrDie')
60cc4dc4 1222
db546cf8
S
1223 # Look for BBC iPlayer embed
1224 matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1225 if matches:
476eae0c 1226 return _playlist_from_matches(matches, ie='BBCCoUk')
db546cf8 1227
93d020dd
S
1228 # Look for embedded RUTV player
1229 rutv_url = RUTVIE._extract_url(webpage)
1230 if rutv_url:
1231 return self.url_result(rutv_url, 'RUTV')
1232
d40a3b5b
S
1233 # Look for embedded SportBox player
1234 sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
1235 if sportbox_urls:
1236 return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
1237
7e2ede98
JMF
1238 # Look for embedded TED player
1239 mobj = re.search(
d7cc31b6 1240 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
7e2ede98
JMF
1241 if mobj is not None:
1242 return self.url_result(mobj.group('url'), 'TED')
1243
5c386252 1244 # Look for embedded Ustream videos
1245 mobj = re.search(
1246 r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1247 if mobj is not None:
1248 return self.url_result(mobj.group('url'), 'Ustream')
1249
893f8832
PH
1250 # Look for embedded arte.tv player
1251 mobj = re.search(
1252 r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1253 webpage)
1254 if mobj is not None:
1255 return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1256
cb3ac1c6
S
1257 # Look for embedded smotri.com player
1258 smotri_url = SmotriIE._extract_url(webpage)
1259 if smotri_url:
1260 return self.url_result(smotri_url, 'Smotri')
1261
20991253
PH
1262 # Look for embedded soundcloud player
1263 mobj = re.search(
ac645ac7 1264 r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
20991253
PH
1265 webpage)
1266 if mobj is not None:
1267 url = unescapeHTML(mobj.group('url'))
1268 return self.url_result(url)
1269
826ec77f
PH
1270 # Look for embedded vulture.com player
1271 mobj = re.search(
1272 r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1273 webpage)
1274 if mobj is not None:
1275 url = unescapeHTML(mobj.group('url'))
1276 return self.url_result(url, ie='Vulture')
1277
c5cd249e
JMF
1278 # Look for embedded mtvservices player
1279 mobj = re.search(
1280 r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1281 webpage)
1282 if mobj is not None:
1283 url = unescapeHTML(mobj.group('url'))
1284 return self.url_result(url, ie='MTVServicesEmbedded')
1285
49807b4a
S
1286 # Look for embedded yahoo player
1287 mobj = re.search(
1288 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1289 webpage)
1290 if mobj is not None:
1291 return self.url_result(mobj.group('url'), 'Yahoo')
1292
2ef6fcb5
PH
1293 # Look for embedded sbs.com.au player
1294 mobj = re.search(
e98b8e79
PH
1295 r'''(?x)
1296 (?:
1297 <meta\s+property="og:video"\s+content=|
1298 <iframe[^>]+?src=
1299 )
1300 (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
2ef6fcb5
PH
1301 webpage)
1302 if mobj is not None:
1303 return self.url_result(mobj.group('url'), 'SBS')
1304
42bdd9d0
PH
1305 # Look for embedded Cinchcast player
1306 mobj = re.search(
1307 r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1308 webpage)
1309 if mobj is not None:
1310 return self.url_result(mobj.group('url'), 'Cinchcast')
1311
1a94ff68 1312 mobj = re.search(
5263cdfc 1313 r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1a94ff68 1314 webpage)
8001607e
YCH
1315 if not mobj:
1316 mobj = re.search(
1317 r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
1318 webpage)
1a94ff68
S
1319 if mobj is not None:
1320 return self.url_result(mobj.group('url'), 'MLB')
1321
1419fafd
S
1322 mobj = re.search(
1323 r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1324 webpage)
1325 if mobj is not None:
1326 return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1327
af63fed7
PH
1328 mobj = re.search(
1329 r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1330 webpage)
1331 if mobj is not None:
1332 return self.url_result(mobj.group('url'), 'Livestream')
1333
255fca5e
S
1334 # Look for Zapiks embed
1335 mobj = re.search(
1336 r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1337 if mobj is not None:
1338 return self.url_result(mobj.group('url'), 'Zapiks')
1339
e3216b82
NJ
1340 # Look for Kaltura embeds
1341 mobj = re.search(
1342 r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1343 if mobj is not None:
1344 return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1345
135c9c42
S
1346 # Look for Eagle.Platform embeds
1347 mobj = re.search(
1348 r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1349 if mobj is not None:
1350 return self.url_result(mobj.group('url'), 'EaglePlatform')
1351
d47ae7f6
S
1352 # Look for ClipYou (uses Eagle.Platform) embeds
1353 mobj = re.search(
1354 r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1355 if mobj is not None:
1356 return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1357
f8388757
S
1358 # Look for Pladform embeds
1359 mobj = re.search(
1360 r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1361 if mobj is not None:
1362 return self.url_result(mobj.group('url'), 'Pladform')
1363
2dcc114f
S
1364 # Look for Playwire embeds
1365 mobj = re.search(
1366 r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
1367 if mobj is not None:
1368 return self.url_result(mobj.group('url'))
1369
ad320e9b
NJ
1370 # Look for 5min embeds
1371 mobj = re.search(
1372 r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1373 if mobj is not None:
1374 return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1375
18153f1b
S
1376 # Look for Crooks and Liars embeds
1377 mobj = re.search(
1378 r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
1379 if mobj is not None:
1380 return self.url_result(mobj.group('url'))
1381
a2edf2e7
YCH
1382 # Look for NBC Sports VPlayer embeds
1383 nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1384 if nbc_sports_url:
1385 return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1386
418c5cc3
YCH
1387 # Look for UDN embeds
1388 mobj = re.search(
1389 r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
1390 if mobj is not None:
1391 return self.url_result(
0a160363 1392 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
418c5cc3 1393
2fe1b5bd
YCH
1394 # Look for Senate ISVP iframe
1395 senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
1396 if senate_isvp_url:
1397 return self.url_result(surl, 'SenateISVP')
1398
def check_video(vurl):
    """Tell whether ``vurl`` should be kept as a candidate video URL.

    A URL accepted by ``YoutubeIE.suitable`` is always kept.  Any other URL
    is kept only when its path component contains a dot and the extension
    reported by ``determine_ext`` is not in the excluded list below.
    """
    excluded_exts = ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')

    if not YoutubeIE.suitable(vurl):
        url_path = compat_urlparse.urlparse(vurl).path
        extension = determine_ext(url_path)
        if '.' not in url_path:
            # No extension at all in the path: not treated as a media file.
            return False
        return extension not in excluded_exts
    return True
1405
def filter_video(urls):
    """Return a list of the entries of ``urls`` that pass ``check_video``, in order."""
    return [candidate for candidate in urls if check_video(candidate)]
1408
9b122384 1409 # Start with something easy: JW Player in SWFObject
ced659bb 1410 found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
b30b8698 1411 if not found:
d981cef6 1412 # Look for gorilla-vid style embedding
ced659bb 1413 found = filter_video(re.findall(r'''(?sx)
c0292e8a
PH
1414 (?:
1415 jw_plugins|
1416 JWPlayerOptions|
1417 jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1418 )
a0f71985
PH
1419 .*?
1420 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
b30b8698 1421 if not found:
9b122384 1422 # Broaden the search a little bit
ced659bb 1423 found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
b30b8698
PH
1424 if not found:
1425 # Broaden the findall a little bit: JWPlayer JS loader
ced659bb
S
1426 found = filter_video(re.findall(
1427 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
4d805e06
PH
1428 if not found:
1429 # Flow player
ced659bb 1430 found = filter_video(re.findall(r'''(?xs)
4d805e06
PH
1431 flowplayer\("[^"]+",\s*
1432 \{[^}]+?\}\s*,
52585fd6 1433 \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
4d805e06 1434 ["']?url["']?\s*:\s*["']([^"']+)["']
ced659bb 1435 ''', webpage))
501f13fb
PH
1436 if not found:
1437 # Cinerama player
1438 found = re.findall(
1439 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
b30b8698 1440 if not found:
9b122384 1441 # Try to find twitter cards info
ced659bb
S
1442 found = filter_video(re.findall(
1443 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
b30b8698 1444 if not found:
9b122384
PH
1445 # We look for Open Graph info:
1446 # We have to match any number of spaces between elements; some sites try to align them (e.g. statigr.am)
b30b8698 1447 m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
9b122384
PH
1448 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1449 if m_video_type is not None:
ced659bb 1450 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
b30b8698 1451 if not found:
7fea7156 1452 # HTML5 video
9b32eca3 1453 found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
b30b8698 1454 if not found:
ed9a25dd 1455 REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
a5a45015 1456 found = re.search(
89ef304b 1457 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
ed9a25dd 1458 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
89ef304b 1459 webpage)
84f81016
S
1460 if not found:
1461 # Look also in Refresh HTTP header
1462 refresh_header = head_response.headers.get('Refresh')
1463 if refresh_header:
ed9a25dd 1464 found = re.search(REDIRECT_REGEX, refresh_header)
b30b8698 1465 if found:
406224be 1466 new_url = compat_urlparse.urljoin(url, found.group(1))
89ef304b
PH
1467 self.report_following_redirect(new_url)
1468 return {
1469 '_type': 'url',
1470 'url': new_url,
1471 }
b30b8698 1472 if not found:
416c7fcb 1473 raise UnsupportedError(url)
9b122384 1474
b30b8698
PH
1475 entries = []
1476 for video_url in found:
1477 video_url = compat_urlparse.urljoin(url, video_url)
1478 video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
9b122384 1479
b30b8698
PH
1480 # Sometimes, jwplayer extraction will result in a YouTube URL
1481 if YoutubeIE.suitable(video_url):
1482 entries.append(self.url_result(video_url, 'Youtube'))
1483 continue
9b122384 1484
b30b8698
PH
1485 # here's a fun little line of code for you:
1486 video_id = os.path.splitext(video_id)[0]
fc9713a1 1487
d6fd958c
YCH
1488 if determine_ext(video_url) == 'smil':
1489 entries.append({
1490 'id': video_id,
1491 'formats': self._extract_smil_formats(video_url, video_id),
1492 'uploader': video_uploader,
1493 'title': video_title,
1494 'age_limit': age_limit,
1495 })
1496 else:
1497 entries.append({
1498 'id': video_id,
1499 'url': video_url,
1500 'uploader': video_uploader,
1501 'title': video_title,
1502 'age_limit': age_limit,
1503 })
b30b8698
PH
1504
1505 if len(entries) == 1:
669f0e7c 1506 return entries[0]
b30b8698
PH
1507 else:
1508 for num, e in enumerate(entries, start=1):
13d8fbef
JMF
1509 # 'url' results don't have a title
1510 if e.get('title') is not None:
1511 e['title'] = '%s (%d)' % (e['title'], num)
b30b8698
PH
1512 return {
1513 '_type': 'playlist',
1514 'entries': entries,
1515 }