]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/generic.py
[NBCSports] Move imports alphabetically
[yt-dlp.git] / youtube_dl / extractor / generic.py
CommitLineData
cfe50f04
JMF
1# encoding: utf-8
2
79649588
PH
3from __future__ import unicode_literals
4
9b122384
PH
5import os
6import re
7
8from .common import InfoExtractor
fc9713a1 9from .youtube import YoutubeIE
8c25f81b 10from ..compat import (
9b122384 11 compat_urllib_parse,
a5caba1e 12 compat_urlparse,
f7300c5c 13 compat_xml_parse_error,
8c25f81b
PH
14)
15from ..utils import (
b759a0d4 16 determine_ext,
9b122384 17 ExtractorError,
c8e9a235 18 float_or_none,
aa94a6d3 19 HEADRequest,
61ca9a80 20 is_html,
ed2d6a19 21 orderedSet,
bcf89ce6 22 parse_xml,
9d4660ca
PH
23 smuggle_url,
24 unescapeHTML,
42393ce2 25 unified_strdate,
4d54ef20 26 unsmuggle_url,
416c7fcb 27 UnsupportedError,
42393ce2 28 url_basename,
76c73715 29 xpath_text,
9b122384 30)
cfe50f04 31from .brightcove import BrightcoveIE
c0d0b01f 32from .ooyala import OoyalaIE
93d020dd 33from .rutv import RUTVIE
cb3ac1c6 34from .smotri import SmotriIE
1419fafd 35from .condenast import CondeNastIE
9b122384 36
0838239e 37
9b122384 38class GenericIE(InfoExtractor):
79649588 39 IE_DESC = 'Generic downloader that works on some sites'
9b122384 40 _VALID_URL = r'.*'
79649588 41 IE_NAME = 'generic'
cfe50f04
JMF
42 _TESTS = [
43 {
79649588 44 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
d360a146 45 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
79649588 46 'info_dict': {
d360a146
S
47 'id': '13601338388002',
48 'ext': 'mp4',
79649588
PH
49 'uploader': 'www.hodiho.fr',
50 'title': 'R\u00e9gis plante sa Jeep',
cfe50f04
JMF
51 }
52 },
c19f7764
JMF
53 # bandcamp page with custom domain
54 {
79649588
PH
55 'add_ie': ['Bandcamp'],
56 'url': 'http://bronyrock.com/track/the-pony-mash',
79649588 57 'info_dict': {
fd50bf62
S
58 'id': '3235767654',
59 'ext': 'mp3',
79649588
PH
60 'title': 'The Pony Mash',
61 'uploader': 'M_Pallante',
c19f7764 62 },
79649588 63 'skip': 'There is a limit of 200 free downloads / month for the test song',
c19f7764 64 },
eeb165e6 65 # embedded brightcove video
dd5bcdc4
JMF
66 # it also tests brightcove videos that need to set the 'Referer' in the
67 # http requests
eeb165e6 68 {
79649588
PH
69 'add_ie': ['Brightcove'],
70 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
71 'info_dict': {
72 'id': '2765128793001',
73 'ext': 'mp4',
74 'title': 'Le cours de bourse : l’analyse technique',
75 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
76 'uploader': 'BFM BUSINESS',
eeb165e6 77 },
79649588
PH
78 'params': {
79 'skip_download': True,
eeb165e6
JMF
80 },
81 },
17ab4d3b
PH
82 {
83 # https://github.com/rg3/youtube-dl/issues/2253
84 'url': 'http://bcove.me/i6nfkrc3',
17ab4d3b
PH
85 'md5': '0ba9446db037002366bab3b3eb30c88c',
86 'info_dict': {
fd50bf62
S
87 'id': '3101154703001',
88 'ext': 'mp4',
17ab4d3b
PH
89 'title': 'Still no power',
90 'uploader': 'thestar.com',
91 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
92 },
93 'add_ie': ['Brightcove'],
94 },
0479c625
S
95 {
96 'url': 'http://www.championat.com/video/football/v/87/87499.html',
97 'md5': 'fb973ecf6e4a78a67453647444222983',
98 'info_dict': {
99 'id': '3414141473001',
100 'ext': 'mp4',
101 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
102 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
103 'uploader': 'Championat',
104 },
105 },
bdf97017 106 {
37aab278 107 # https://github.com/rg3/youtube-dl/issues/3541
bdf97017
NJ
108 'add_ie': ['Brightcove'],
109 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
110 'info_dict': {
111 'id': '3866516442001',
37aab278 112 'ext': 'mp4',
bdf97017
NJ
113 'title': 'Leer mij vrouwen kennen: Aflevering 1',
114 'description': 'Leer mij vrouwen kennen: Aflevering 1',
115 'uploader': 'SBS Broadcasting',
116 },
37aab278 117 'skip': 'Restricted to Netherlands',
bdf97017 118 'params': {
37aab278 119 'skip_download': True, # m3u8 download
bdf97017
NJ
120 },
121 },
42393ce2
PH
122 # Direct link to a video
123 {
79649588 124 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
79649588
PH
125 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
126 'info_dict': {
127 'id': 'trailer',
89ef304b 128 'ext': 'mp4',
79649588
PH
129 'title': 'trailer',
130 'upload_date': '20100513',
42393ce2 131 }
c0d0b01f
JMF
132 },
133 # ooyala video
134 {
79649588 135 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
87830900 136 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
79649588
PH
137 'info_dict': {
138 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
139 'ext': 'mp4',
3486df38 140 'title': '2cc213299525360.mov', # that's what we get
c0d0b01f 141 },
87830900 142 'add_ie': ['Ooyala'],
c0d0b01f 143 },
f076b638 144 # multiple ooyala embeds on SBN network websites
145 {
146 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
147 'info_dict': {
148 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
149 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
150 },
151 'playlist_mincount': 3,
152 'params': {
153 'skip_download': True,
154 },
155 'add_ie': ['Ooyala'],
156 },
89ef304b
PH
157 # google redirect
158 {
159 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
160 'info_dict': {
161 'id': 'cmQHVoWB5FY',
162 'ext': 'mp4',
163 'upload_date': '20130224',
164 'uploader_id': 'TheVerge',
87830900 165 'description': 're:^Chris Ziegler takes a look at the\.*',
89ef304b
PH
166 'uploader': 'The Verge',
167 'title': 'First Firefox OS phones side-by-side',
168 },
169 'params': {
170 'skip_download': False,
171 }
f55a1f0a 172 },
1b86cc41 173 # embed.ly video
174 {
175 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
176 'info_dict': {
177 'id': '9ODmcdjQcHQ',
178 'ext': 'mp4',
0a5bce56
PH
179 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
180 'upload_date': '20140225',
181 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
182 'uploader': 'Tested',
183 'uploader_id': 'testedcom',
1b86cc41 184 },
185 # No need to test YoutubeIE here
186 'params': {
187 'skip_download': True,
188 },
189 },
60cc4dc4
PH
190 # funnyordie embed
191 {
192 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
60cc4dc4
PH
193 'info_dict': {
194 'id': '18e820ec3f',
195 'ext': 'mp4',
196 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
197 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
93d020dd 198 },
60cc4dc4 199 },
faa4ea68
S
200 # BBC iPlayer embeds
201 {
202 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
203 'info_dict': {
204 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
205 },
206 'playlist_mincount': 18,
207 },
93d020dd
S
208 # RUTV embed
209 {
210 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
211 'info_dict': {
212 'id': '776940',
213 'ext': 'mp4',
214 'title': 'Охотское море стало целиком российским',
215 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
216 },
217 'params': {
218 # m3u8 download
219 'skip_download': True,
220 },
aab74fa1
PH
221 },
222 # Embedded TED video
223 {
224 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
a8eb5a8e 225 'md5': '65fdff94098e4a607385a60c5177c638',
aab74fa1 226 'info_dict': {
a8eb5a8e 227 'id': '1969',
aab74fa1 228 'ext': 'mp4',
a8eb5a8e
PH
229 'title': 'Hidden miracles of the natural world',
230 'uploader': 'Louie Schwartzberg',
231 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
aab74fa1 232 }
60cc4dc4 233 },
5c386252 234 # Embedded Ustream video
235 {
236 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
237 'md5': '27b99cdb639c9b12a79bca876a073417',
238 'info_dict': {
ca6aada4 239 'id': '45734260',
240 'ext': 'flv',
241 'uploader': 'AU SPA: The NSA and Privacy',
5c386252 242 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
243 }
244 },
d95e35d6
S
245 # nowvideo embed hidden behind percent encoding
246 {
247 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
248 'md5': '2baf4ddd70f697d94b1c18cf796d5107',
249 'info_dict': {
250 'id': '06e53103ca9aa',
251 'ext': 'flv',
252 'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
253 'description': 'No description',
254 },
0f2a2ba1 255 },
893f8832
PH
256 # arte embed
257 {
258 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
259 'md5': '7653032cbb25bf6c80d80f217055fa43',
260 'info_dict': {
261 'id': '048195-004_PLUS7-F',
262 'ext': 'flv',
263 'title': 'X:enius',
264 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
265 'upload_date': '20140320',
266 },
267 'params': {
268 'skip_download': 'Requires rtmpdump'
269 }
270 },
fa35cdad
PH
271 # Condé Nast embed
272 {
273 'url': 'http://www.wired.com/2014/04/honda-asimo/',
274 'md5': 'ba0dfe966fa007657bd1443ee672db0f',
275 'info_dict': {
276 'id': '53501be369702d3275860000',
277 'ext': 'mp4',
278 'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
279 }
ebd3c7b3
PH
280 },
281 # Dailymotion embed
282 {
283 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
284 'md5': '441aeeb82eb72c422c7f14ec533999cd',
285 'info_dict': {
286 'id': 'k2mm4bCdJ6CQ2i7c8o2',
287 'ext': 'mp4',
288 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
289 'uploader': 'Spi0n',
290 },
291 'add_ie': ['Dailymotion'],
2b88feed
PH
292 },
293 # YouTube embed
294 {
295 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
296 'info_dict': {
297 'id': 'FXRb4ykk4S0',
298 'ext': 'mp4',
299 'title': 'The NBL Auction 2014',
300 'uploader': 'BADMINTON England',
301 'uploader_id': 'BADMINTONEvents',
302 'upload_date': '20140603',
303 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
304 },
305 'add_ie': ['Youtube'],
306 'params': {
307 'skip_download': True,
308 }
309 },
c5cd249e
JMF
310 # MTVServices embed
311 {
312 'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
313 'md5': '35727f82f58c76d996fc188f9755b0d5',
314 'info_dict': {
315 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
316 'ext': 'mp4',
317 'title': 'Review',
318 'description': 'Mario\'s life in the fast lane has never looked so good.',
319 },
320 },
61013473 321 # YouTube embed via <data-embed-url="">
322 {
323 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
61013473 324 'info_dict': {
a8eb5a8e 325 'id': '4vAffPZIT44',
61013473 326 'ext': 'mp4',
a8eb5a8e 327 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
ed2d6a19
PH
328 'uploader': 'Gameloft',
329 'uploader_id': 'gameloft',
a8eb5a8e
PH
330 'upload_date': '20140828',
331 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
ed2d6a19
PH
332 },
333 'params': {
334 'skip_download': True,
61013473 335 }
c8e9a235
PH
336 },
337 # Camtasia studio
338 {
339 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
340 'playlist': [{
341 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
342 'info_dict': {
343 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
344 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
345 'ext': 'flv',
346 'duration': 2235.90,
347 }
348 }, {
349 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
350 'info_dict': {
351 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
352 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
353 'ext': 'flv',
354 'duration': 2235.93,
355 }
356 }],
357 'info_dict': {
358 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
359 }
4d805e06
PH
360 },
361 # Flowplayer
362 {
363 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
364 'md5': '9d65602bf31c6e20014319c7d07fba27',
365 'info_dict': {
366 'id': '5123ea6d5e5a7',
367 'ext': 'mp4',
368 'age_limit': 18,
369 'uploader': 'www.handjobhub.com',
d6d9186f 370 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
4d805e06 371 }
0990305d
PH
372 },
373 # RSS feed
374 {
375 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
376 'info_dict': {
377 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
378 'title': 'Zero Punctuation',
b1b0b1ca 379 'description': 're:.*groundbreaking video review series.*'
0990305d
PH
380 },
381 'playlist_mincount': 11,
22a6f150
PH
382 },
383 # Multiple brightcove videos
384 # https://github.com/rg3/youtube-dl/issues/2283
385 {
386 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
387 'info_dict': {
388 'id': 'always-never',
389 'title': 'Always / Never - The New Yorker',
390 },
391 'playlist_count': 3,
392 'params': {
393 'extract_flat': False,
394 'skip_download': True,
395 }
1a94ff68
S
396 },
397 # MLB embed
398 {
399 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
400 'md5': '96f09a37e44da40dd083e12d9a683327',
401 'info_dict': {
402 'id': '33322633',
403 'ext': 'mp4',
404 'title': 'Ump changes call to ball',
405 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
406 'duration': 48,
407 'timestamp': 1401537900,
408 'upload_date': '20140531',
409 'thumbnail': 're:^https?://.*\.jpg$',
410 },
411 },
746c67d7
NJ
412 # Wistia embed
413 {
414 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
415 'md5': '8788b683c777a5cf25621eaf286d0c23',
416 'info_dict': {
417 'id': '1cfaf6b7ea',
418 'ext': 'mov',
419 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
420 'duration': 643.0,
421 'filesize': 182808282,
422 'uploader': 'education-portal.com',
423 },
424 },
52cffcb1 425 {
426 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
427 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
428 'info_dict': {
429 'id': 'uxjb0lwrcz',
430 'ext': 'mp4',
85d7b765 431 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
52cffcb1 432 'duration': 1715.0,
85d7b765 433 'uploader': 'thoughtworks.wistia.com',
70b7e3fb 434 },
52cffcb1 435 },
70b7e3fb
PH
436 # Direct download with broken HEAD
437 {
438 'url': 'http://ai-radio.org:8000/radio.opus',
439 'info_dict': {
440 'id': 'radio',
441 'ext': 'opus',
442 'title': 'radio',
443 },
444 'params': {
445 'skip_download': True, # infinite live stream
446 },
447 'expected_warnings': [
448 r'501.*Not Implemented'
449 ],
ac645ac7
PH
450 },
451 # Soundcloud embed
452 {
453 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
454 'info_dict': {
455 'id': '174391317',
456 'ext': 'mp3',
457 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
458 'uploader': 'Sophos Security',
459 'title': 'Chet Chat 171 - Oct 29, 2014',
460 'upload_date': '20141029',
461 }
af63fed7
PH
462 },
463 # Livestream embed
464 {
465 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
466 'info_dict': {
467 'id': '67864563',
468 'ext': 'flv',
469 'upload_date': '20141112',
470 'title': 'Rosetta #CometLanding webcast HL 10',
471 }
472 },
65f3a228
PH
473 # LazyYT
474 {
475 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
476 'info_dict': {
11e611a7 477 'id': '1986',
65f3a228
PH
478 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
479 },
480 'playlist_mincount': 2,
4e262a88
PH
481 },
482 # Direct link with incorrect MIME type
483 {
484 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
485 'md5': '4ccbebe5f36706d85221f204d7eb5913',
486 'info_dict': {
487 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
488 'id': '5_Lennart_Poettering_-_Systemd',
489 'ext': 'webm',
490 'title': '5_Lennart_Poettering_-_Systemd',
491 'upload_date': '20141120',
492 },
493 'expected_warnings': [
494 'URL could be a direct video link, returning it as such.'
495 ]
42bdd9d0
PH
496 },
497 # Cinchcast embed
498 {
499 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
500 'info_dict': {
501 'id': '7141703',
502 'ext': 'mp3',
503 'upload_date': '20141126',
504 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
505 }
506 },
501f13fb
PH
507 # Cinerama player
508 {
509 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
510 'info_dict': {
511 'id': '730m_DandD_1901_512k',
512 'ext': 'mp4',
513 'uploader': 'www.abc.net.au',
514 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
515 }
796df3c6
S
516 },
517 # embedded viddler video
518 {
519 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
520 'info_dict': {
521 'id': '4d03aad9',
522 'ext': 'mp4',
523 'uploader': 'deadspin',
524 'title': 'WALL-TO-GORTAT',
525 'timestamp': 1422285291,
526 'upload_date': '20150126',
527 },
528 'add_ie': ['Viddler'],
a0f71985 529 },
2051acde
S
530 # Libsyn embed
531 {
532 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
533 'info_dict': {
534 'id': '3377616',
535 'ext': 'mp3',
536 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
537 'description': 'md5:601cb790edd05908957dae8aaa866465',
538 'upload_date': '20150220',
539 },
540 },
a0f71985
PH
541 # jwplayer YouTube
542 {
543 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
544 'info_dict': {
545 'id': 'Mrj4DVp2zeA',
546 'ext': 'mp4',
f37e3f99 547 'upload_date': '20150212',
a0f71985
PH
548 'uploader': 'The National Archives UK',
549 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
550 'uploader_id': 'NationalArchives08',
551 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
552 },
59b8ab58
PH
553 },
554 # rtl.nl embed
555 {
556 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
557 'playlist_mincount': 5,
558 'info_dict': {
559 'id': 'aanslagen-kopenhagen',
560 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
561 }
255fca5e
S
562 },
563 # Zapiks embed
564 {
565 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
566 'info_dict': {
567 'id': '118046',
568 'ext': 'mp4',
569 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
570 }
571 },
e3216b82
NJ
572 # Kaltura embed
573 {
574 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
575 'info_dict': {
576 'id': '1_eergr3h1',
577 'ext': 'mp4',
578 'upload_date': '20150226',
579 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
580 'timestamp': int,
581 'title': 'John Carlson Postgame 2/25/15',
582 },
583 },
135c9c42
S
584 # Eagle.Platform embed (generic URL)
585 {
586 'url': 'http://lenta.ru/news/2015/03/06/navalny/',
587 'info_dict': {
588 'id': '227304',
589 'ext': 'mp4',
590 'title': 'Навальный вышел на свободу',
591 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
592 'thumbnail': 're:^https?://.*\.jpg$',
593 'duration': 87,
594 'view_count': int,
595 'age_limit': 0,
596 },
597 },
d47ae7f6
S
598 # ClipYou (Eagle.Platform) embed (custom URL)
599 {
600 'url': 'http://muz-tv.ru/play/7129/',
601 'info_dict': {
602 'id': '12820',
603 'ext': 'mp4',
604 'title': "'O Sole Mio",
605 'thumbnail': 're:^https?://.*\.jpg$',
606 'duration': 216,
607 'view_count': int,
608 },
609 },
f8388757
S
610 # Pladform embed
611 {
612 'url': 'http://muz-tv.ru/kinozal/view/7400/',
613 'info_dict': {
614 'id': '100183293',
615 'ext': 'mp4',
616 'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
617 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
618 'thumbnail': 're:^https?://.*\.jpg$',
619 'duration': 694,
620 'age_limit': 0,
621 },
622 },
ad320e9b
NJ
623 # 5min embed
624 {
625 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
626 'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
627 'info_dict': {
628 'id': '518726732',
629 'ext': 'mp4',
630 'title': 'Facebook Creates "On This Day" | Crunch Report',
631 },
632 },
76c73715
PH
633 # RSS feed with enclosure
634 {
635 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
636 'info_dict': {
637 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
638 'ext': 'm4v',
639 'upload_date': '20150228',
640 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
641 }
642 }
cfe50f04 643 ]
9b122384 644
9b122384
PH
def report_following_redirect(self, new_url):
    """Announce on screen that a redirect to new_url is being followed.

    The message is written through the downloader's to_screen() and is
    prefixed with '[redirect]'.  Nothing is returned.
    """
    notice = '[redirect] Following redirect to %s' % new_url
    self._downloader.to_screen(notice)
9b122384 648
4fc946b5
PH
def _extract_rss(self, url, video_id, doc):
    """Turn a parsed RSS document into a playlist of URL entries.

    url is used as the playlist id.  video_id is accepted but not used
    by this method.  doc is the root element of the parsed feed.

    Each <item> under <channel> contributes one entry of type 'url'
    pointing at its <link>; when the link is missing or empty, the first
    <enclosure> carrying a non-empty 'url' attribute is used instead.
    Items with neither are skipped.

    A missing <title> (on the channel or on an item) or a missing
    channel <description> results in None for that field rather than an
    AttributeError.
    """
    def text_or_none(node, path):
        # Text of the first element matching path below node, or None
        # when there is no such element.
        found = node.find(path)
        return None if found is None else found.text

    entries = []
    for item in doc.findall('./channel/item'):
        next_url = xpath_text(item, 'link', fatal=False)
        if not next_url:
            # No usable <link>: fall back to the first enclosure that
            # actually carries a URL.
            for enclosure in item.findall('./enclosure'):
                next_url = enclosure.attrib.get('url')
                if next_url:
                    break

        if not next_url:
            continue

        entries.append({
            '_type': 'url',
            'url': next_url,
            # Item titles are optional in RSS; do not crash without one.
            'title': text_or_none(item, 'title'),
        })

    return {
        '_type': 'playlist',
        'id': url,
        'title': text_or_none(doc, './channel/title'),
        'description': text_or_none(doc, './channel/description'),
        'entries': entries,
    }
680
c8e9a235
PH
def _extract_camtasia(self, url, video_id, webpage):
    """Extract a Camtasia Studio playlist referenced by webpage.

    Returns None if no camtasia video can be found, i.e. when the page
    has no 'csConfigFile' flash variable or the downloaded configuration
    contains no './playlist/array/fileset' node.

    Otherwise the configuration XML is downloaded (resolved relative to
    url) and a playlist dict is returned whose entries are built from
    the children of the fileset node that have a non-empty <uri>.  The
    playlist title comes from the page's 'DC.title' meta tag, which is
    looked up with fatal=True.
    """
    cfg_path = self._search_regex(
        r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
        webpage, 'camtasia configuration file', default=None)
    if cfg_path is None:
        return None

    title = self._html_search_meta('DC.title', webpage, fatal=True)

    cfg_url = compat_urlparse.urljoin(url, cfg_path)
    cfg_doc = self._download_xml(
        cfg_url, video_id,
        note='Downloading camtasia configuration',
        errnote='Failed to download camtasia configuration')
    fileset_node = cfg_doc.find('./playlist/array/fileset')
    if fileset_node is None:
        # Configuration without a fileset: nothing to extract.
        return None

    entries = []
    # Iterate the element directly; Element.getchildren() no longer
    # exists on Python 3.9+.
    for child in fileset_node:
        uri_node = child.find('./uri')
        if uri_node is None or not uri_node.text:
            continue
        media_uri = uri_node.text

        # A missing <duration> is passed on as None.
        # NOTE(review): assumes float_or_none(None) yields None - confirm.
        duration_node = child.find('./duration')
        duration_text = None if duration_node is None else duration_node.text

        entries.append({
            # File name of the media URI without its extension.
            'id': os.path.splitext(media_uri.rpartition('/')[2])[0],
            'title': '%s - %s' % (title, child.tag),
            'url': compat_urlparse.urljoin(url, media_uri),
            'duration': float_or_none(duration_text),
        })

    return {
        '_type': 'playlist',
        'entries': entries,
        'title': title,
    }
717
9b122384 718 def _real_extract(self, url):
ebd3c7b3
PH
719 if url.startswith('//'):
720 return {
721 '_type': 'url',
20991253 722 'url': self.http_scheme() + url,
ebd3c7b3
PH
723 }
724
a7130543
JMF
725 parsed_url = compat_urlparse.urlparse(url)
726 if not parsed_url.scheme:
04b4d394
PH
727 default_search = self._downloader.params.get('default_search')
728 if default_search is None:
1f7ccb90 729 default_search = 'fixup_error'
04b4d394 730
1f7ccb90 731 if default_search in ('auto', 'auto_warning', 'fixup_error'):
04b4d394
PH
732 if '/' in url:
733 self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
734 return self.url_result('http://' + url)
1f7ccb90 735 elif default_search != 'fixup_error':
9c1fc022 736 if default_search == 'auto_warning':
0e67ab0d
PH
737 if re.match(r'^(?:url|URL)$', url):
738 raise ExtractorError(
739 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
740 expected=True)
741 else:
742 self._downloader.report_warning(
7571c02c 743 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
04b4d394 744 return self.url_result('ytsearch:' + url)
1f7ccb90
PH
745
746 if default_search in ('error', 'fixup_error'):
7571c02c 747 raise ExtractorError(
b74e86f4
PH
748 '%r is not a valid URL. '
749 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
750 % (url, url), expected=True)
04b4d394 751 else:
f2f2c0c2
PH
752 if ':' not in default_search:
753 default_search += ':'
04b4d394 754 return self.url_result(default_search + url)
4d54ef20
PH
755
756 url, smuggled_data = unsmuggle_url(url)
757 force_videoid = None
d6e6a422 758 is_intentional = smuggled_data and smuggled_data.get('to_generic')
4d54ef20
PH
759 if smuggled_data and 'force_videoid' in smuggled_data:
760 force_videoid = smuggled_data['force_videoid']
761 video_id = force_videoid
762 else:
763 video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
a7130543 764
79649588 765 self.to_screen('%s: Requesting header' % video_id)
c1d1facd 766
ebab4520 767 head_req = HEADRequest(url)
23be51d8 768 head_response = self._request_webpage(
ebab4520
PH
769 head_req, video_id,
770 note=False, errnote='Could not send HEAD request to %s' % url,
771 fatal=False)
42393ce2 772
23be51d8 773 if head_response is not False:
42393ce2 774 # Check for redirect
23be51d8 775 new_url = head_response.geturl()
42393ce2
PH
776 if url != new_url:
777 self.report_following_redirect(new_url)
4d54ef20
PH
778 if force_videoid:
779 new_url = smuggle_url(
780 new_url, {'force_videoid': force_videoid})
cecaaf3f 781 return self.url_result(new_url)
42393ce2 782
23be51d8
PH
783 full_response = None
784 if head_response is False:
785 full_response = self._request_webpage(url, video_id)
786 head_response = full_response
787
788 # Check for direct link to a video
789 content_type = head_response.headers.get('Content-Type', '')
790 m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
791 if m:
792 upload_date = unified_strdate(
793 head_response.headers.get('Last-Modified'))
794 return {
795 'id': video_id,
796 'title': os.path.splitext(url_basename(url))[0],
ccdd0ffb 797 'direct': True,
23be51d8
PH
798 'formats': [{
799 'format_id': m.group('format_id'),
800 'url': url,
801 'vcodec': 'none' if m.group('type') == 'audio' else None
802 }],
803 'upload_date': upload_date,
804 }
42393ce2 805
d6e6a422
PH
806 if not self._downloader.params.get('test', False) and not is_intentional:
807 self._downloader.report_warning('Falling back on generic information extractor.')
808
4e262a88
PH
809 if not full_response:
810 full_response = self._request_webpage(url, video_id)
811
812 # Maybe it's a direct link to a video?
813 # Be careful not to download the whole thing!
814 first_bytes = full_response.read(512)
61ca9a80 815 if not is_html(first_bytes):
4e262a88
PH
816 self._downloader.report_warning(
817 'URL could be a direct video link, returning it as such.')
818 upload_date = unified_strdate(
819 head_response.headers.get('Last-Modified'))
820 return {
821 'id': video_id,
822 'title': os.path.splitext(url_basename(url))[0],
823 'direct': True,
824 'url': url,
825 'upload_date': upload_date,
826 }
827
828 webpage = self._webpage_read_content(
829 full_response, url, video_id, prefix=first_bytes)
830
9b122384 831 self.report_extraction(video_id)
887c6acd 832
4fc946b5
PH
833 # Is it an RSS feed?
834 try:
bcf89ce6 835 doc = parse_xml(webpage)
4fc946b5
PH
836 if doc.tag == 'rss':
837 return self._extract_rss(url, video_id, doc)
f7300c5c 838 except compat_xml_parse_error:
4fc946b5
PH
839 pass
840
c8e9a235
PH
841 # Is it a Camtasia project?
842 camtasia_res = self._extract_camtasia(url, video_id, webpage)
843 if camtasia_res is not None:
844 return camtasia_res
845
14390730
S
846 # Sometimes embedded video player is hidden behind percent encoding
847 # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
848 # Unescaping the whole page allows to handle those cases in a generic way
1f7659db
S
849 webpage = compat_urllib_parse.unquote(webpage)
850
887c6acd
PH
851 # it's tempting to parse this further, but you would
852 # have to take into account all the variations like
853 # Video Title - Site Name
854 # Site Name | Video Title
855 # Video Title - Tagline | Site Name
856 # and so on and so forth; it's just not practical
ef4fd848 857 video_title = self._html_search_regex(
79649588
PH
858 r'(?s)<title>(.*?)</title>', webpage, 'video title',
859 default='video')
ef4fd848 860
4d805e06
PH
861 # Try to detect age limit automatically
862 age_limit = self._rta_search(webpage)
863 # And then there are the jokers who advertise that they use RTA,
864 # but actually don't.
865 AGE_LIMIT_MARKERS = [
866 r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
867 ]
868 if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
869 age_limit = 18
870
ef4fd848
PH
871 # video uploader is domain name
872 video_uploader = self._search_regex(
79649588 873 r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
887c6acd 874
ed2d6a19 875 # Helper method
83992676 876 def _playlist_from_matches(matches, getter=None, ie=None):
3b2f933b 877 urlrs = orderedSet(
83992676 878 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
3b2f933b 879 for m in matches)
ed2d6a19
PH
880 return self.playlist_result(
881 urlrs, playlist_id=video_id, playlist_title=video_title)
882
627a91a9 883 # Look for BrightCove:
99877772
PH
884 bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
885 if bc_urls:
79649588 886 self.to_screen('Brightcove video detected.')
99877772
PH
887 entries = [{
888 '_type': 'url',
889 'url': smuggle_url(bc_url, {'Referer': url}),
890 'ie_key': 'Brightcove'
891 } for bc_url in bc_urls]
892
893 return {
894 '_type': 'playlist',
895 'title': video_title,
896 'id': video_id,
897 'entries': entries,
898 }
cfe50f04 899
59b8ab58
PH
900 # Look for embedded rtl.nl player
901 matches = re.findall(
902 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
903 webpage)
904 if matches:
905 return _playlist_from_matches(matches, ie='RtlNl')
906
7115ca84 907 # Look for embedded (iframe) Vimeo player
9d4660ca 908 mobj = re.search(
15fd51b3 909 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
9d4660ca 910 if mobj:
15fd51b3 911 player_url = unescapeHTML(mobj.group('url'))
9d4660ca 912 surl = smuggle_url(player_url, {'Referer': url})
09a42738 913 return self.url_result(surl)
7115ca84
PH
914 # Look for embedded (swf embed) Vimeo player
915 mobj = re.search(
09a42738 916 r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
7115ca84 917 if mobj:
09a42738 918 return self.url_result(mobj.group(1))
7115ca84 919
53c1d3ef 920 # Look for embedded YouTube player
1f9da904 921 matches = re.findall(r'''(?x)
2b88feed
PH
922 (?:
923 <iframe[^>]+?src=|
c71dfccc 924 data-video-url=|
2b88feed 925 <embed[^>]+?src=|
a7e97f6d
PH
926 embedSWF\(?:\s*|
927 new\s+SWFObject\(
2b88feed
PH
928 )
929 (["\'])
1bf5423e 930 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
6b08cdf6 931 (?:embed|v|p)/.+?)
1f9da904 932 \1''', webpage)
887c6acd 933 if matches:
ed2d6a19 934 return _playlist_from_matches(
3b2f933b 935 matches, lambda m: unescapeHTML(m[1]))
53c1d3ef 936
65f3a228
PH
937 # Look for lazyYT YouTube embed
938 matches = re.findall(
939 r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
940 if matches:
941 return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
942
355e4fd0
PH
943 # Look for embedded Dailymotion player
944 matches = re.findall(
ef4fd848 945 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
355e4fd0 946 if matches:
ed2d6a19
PH
947 return _playlist_from_matches(
948 matches, lambda m: unescapeHTML(m[1]))
355e4fd0 949
8489578d
NJ
950 # Look for embedded Dailymotion playlist player (#3822)
951 m = re.search(
952 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
953 if m:
954 playlists = re.findall(
955 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
956 if playlists:
957 return _playlist_from_matches(
958 playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
959
ef4fd848
PH
960 # Look for embedded Wistia player
961 match = re.search(
281d3f1d 962 r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
ef4fd848 963 if match:
9471c444
NJ
964 embed_url = self._proto_relative_url(
965 unescapeHTML(match.group('url')))
ef4fd848
PH
966 return {
967 '_type': 'url_transparent',
9471c444 968 'url': embed_url,
ef4fd848
PH
969 'ie_key': 'Wistia',
970 'uploader': video_uploader,
971 'title': video_title,
972 'id': video_id,
973 }
5f6a1245 974
9471c444 975 match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
746c67d7
NJ
976 if match:
977 return {
978 '_type': 'url_transparent',
979 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
980 'ie_key': 'Wistia',
981 'uploader': video_uploader,
982 'title': video_title,
983 'id': match.group('id')
984 }
ef4fd848 985
ee3e63e4 986 # Look for embedded blip.tv player
19dab5e6 987 mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
ee3e63e4 988 if mobj:
2514d263 989 return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
1f8b6af7 990 mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
ee3e63e4 991 if mobj:
19dab5e6 992 return self.url_result(mobj.group(1), 'BlipTV')
ee3e63e4 993
fa35cdad
PH
994 # Look for embedded condenast player
995 matches = re.findall(
996 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
997 webpage)
998 if matches:
999 return {
1000 '_type': 'playlist',
1001 'entries': [{
1002 '_type': 'url',
1003 'ie_key': 'CondeNast',
1004 'url': ma,
1005 } for ma in matches],
1006 'title': video_title,
1007 'id': video_id,
1008 }
1009
c19f7764
JMF
1010 # Look for Bandcamp pages with custom domain
1011 mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1012 if mobj is not None:
1013 burl = unescapeHTML(mobj.group(1))
09804265
JMF
1014 # Don't set the extractor because it can be a track url or an album
1015 return self.url_result(burl)
c19f7764 1016
f25571ff
PH
1017 # Look for embedded Vevo player
1018 mobj = re.search(
1019 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1020 if mobj is not None:
1021 return self.url_result(mobj.group('url'))
796df3c6
S
1022
1023 # Look for embedded Viddler player
cb454b33
S
1024 mobj = re.search(
1025 r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1026 webpage)
796df3c6
S
1027 if mobj is not None:
1028 return self.url_result(mobj.group('url'))
f25571ff 1029
3378d67a
S
1030 # Look for NYTimes player
1031 mobj = re.search(
1032 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1033 webpage)
1034 if mobj is not None:
1035 return self.url_result(mobj.group('url'))
1036
cefdf970
S
1037 # Look for Libsyn player
1038 mobj = re.search(
1039 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1040 if mobj is not None:
1041 return self.url_result(mobj.group('url'))
1042
c0d0b01f 1043 # Look for Ooyala videos
cb454b33 1044 mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
f076b638 1045 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1046 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage))
c0d0b01f 1047 if mobj is not None:
750f9020 1048 return OoyalaIE._build_url_result(mobj.group('ec'))
c0d0b01f 1049
f076b638 1050 # Look for multiple Ooyala embeds on SBN network websites
1051 mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1052 if mobj is not None:
1053 embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1054 if embeds:
1055 return _playlist_from_matches(
1056 embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1057
aa94a6d3 1058 # Look for Aparat videos
48099643 1059 mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
aa94a6d3
PH
1060 if mobj is not None:
1061 return self.url_result(mobj.group(1), 'Aparat')
1062
c93c2ab1 1063 # Look for MPORA videos
c3f51436 1064 mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
c93c2ab1
PH
1065 if mobj is not None:
1066 return self.url_result(mobj.group(1), 'Mpora')
5f59ee79 1067
15c0e8e7 1068 # Look for embedded NovaMov-based player
8f89e687 1069 mobj = re.search(
8dfa187b 1070 r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
15c0e8e7
S
1071 (?P<url>http://(?:(?:embed|www)\.)?
1072 (?:novamov\.com|
1073 nowvideo\.(?:ch|sx|eu|at|ag|co)|
1074 videoweed\.(?:es|com)|
1075 movshare\.(?:net|sx|ag)|
1076 divxstage\.(?:eu|net|ch|co|at|ag))
1077 /embed\.php.+?)\1''', webpage)
8f89e687 1078 if mobj is not None:
15c0e8e7 1079 return self.url_result(mobj.group('url'))
50f56607 1080
9834872b
PH
1081 # Look for embedded Facebook player
1082 mobj = re.search(
db1f3888 1083 r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
9834872b
PH
1084 if mobj is not None:
1085 return self.url_result(mobj.group('url'), 'Facebook')
1086
ca97a56e
S
1087 # Look for embedded VK player
1088 mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1089 if mobj is not None:
1090 return self.url_result(mobj.group('url'), 'VK')
1091
0364fa8b
S
1092 # Look for embedded ivi player
1093 mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1094 if mobj is not None:
1095 return self.url_result(mobj.group('url'), 'Ivi')
1096
db1f3888
PH
1097 # Look for embedded Huffington Post player
1098 mobj = re.search(
c3f51436 1099 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
db1f3888
PH
1100 if mobj is not None:
1101 return self.url_result(mobj.group('url'), 'HuffPost')
1102
1b86cc41 1103 # Look for embed.ly
1104 mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1105 if mobj is not None:
1106 return self.url_result(mobj.group('url'))
1107 mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1108 if mobj is not None:
1109 return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1110
60cc4dc4
PH
1111 # Look for funnyordie embed
1112 matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1113 if matches:
ed2d6a19
PH
1114 return _playlist_from_matches(
1115 matches, getter=unescapeHTML, ie='FunnyOrDie')
60cc4dc4 1116
db546cf8
S
1117 # Look for BBC iPlayer embed
1118 matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1119 if matches:
476eae0c 1120 return _playlist_from_matches(matches, ie='BBCCoUk')
db546cf8 1121
93d020dd
S
1122 # Look for embedded RUTV player
1123 rutv_url = RUTVIE._extract_url(webpage)
1124 if rutv_url:
1125 return self.url_result(rutv_url, 'RUTV')
1126
7e2ede98
JMF
1127 # Look for embedded TED player
1128 mobj = re.search(
d7cc31b6 1129 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
7e2ede98
JMF
1130 if mobj is not None:
1131 return self.url_result(mobj.group('url'), 'TED')
1132
5c386252 1133 # Look for embedded Ustream videos
1134 mobj = re.search(
1135 r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1136 if mobj is not None:
1137 return self.url_result(mobj.group('url'), 'Ustream')
1138
893f8832
PH
1139 # Look for embedded arte.tv player
1140 mobj = re.search(
1141 r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1142 webpage)
1143 if mobj is not None:
1144 return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1145
cb3ac1c6
S
1146 # Look for embedded smotri.com player
1147 smotri_url = SmotriIE._extract_url(webpage)
1148 if smotri_url:
1149 return self.url_result(smotri_url, 'Smotri')
1150
20991253
PH
1151 # Look for embedded soundcloud player
1152 mobj = re.search(
ac645ac7 1153 r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
20991253
PH
1154 webpage)
1155 if mobj is not None:
1156 url = unescapeHTML(mobj.group('url'))
1157 return self.url_result(url)
1158
826ec77f
PH
1159 # Look for embedded vulture.com player
1160 mobj = re.search(
1161 r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1162 webpage)
1163 if mobj is not None:
1164 url = unescapeHTML(mobj.group('url'))
1165 return self.url_result(url, ie='Vulture')
1166
c5cd249e
JMF
1167 # Look for embedded mtvservices player
1168 mobj = re.search(
1169 r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1170 webpage)
1171 if mobj is not None:
1172 url = unescapeHTML(mobj.group('url'))
1173 return self.url_result(url, ie='MTVServicesEmbedded')
1174
49807b4a
S
1175 # Look for embedded yahoo player
1176 mobj = re.search(
1177 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1178 webpage)
1179 if mobj is not None:
1180 return self.url_result(mobj.group('url'), 'Yahoo')
1181
2ef6fcb5
PH
1182 # Look for embedded sbs.com.au player
1183 mobj = re.search(
e98b8e79
PH
1184 r'''(?x)
1185 (?:
1186 <meta\s+property="og:video"\s+content=|
1187 <iframe[^>]+?src=
1188 )
1189 (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
2ef6fcb5
PH
1190 webpage)
1191 if mobj is not None:
1192 return self.url_result(mobj.group('url'), 'SBS')
1193
42bdd9d0
PH
1194 # Look for embedded Cinchcast player
1195 mobj = re.search(
1196 r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1197 webpage)
1198 if mobj is not None:
1199 return self.url_result(mobj.group('url'), 'Cinchcast')
1200
1a94ff68 1201 mobj = re.search(
5263cdfc 1202 r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1a94ff68
S
1203 webpage)
1204 if mobj is not None:
1205 return self.url_result(mobj.group('url'), 'MLB')
1206
1419fafd
S
1207 mobj = re.search(
1208 r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1209 webpage)
1210 if mobj is not None:
1211 return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1212
af63fed7
PH
1213 mobj = re.search(
1214 r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1215 webpage)
1216 if mobj is not None:
1217 return self.url_result(mobj.group('url'), 'Livestream')
1218
255fca5e
S
1219 # Look for Zapiks embed
1220 mobj = re.search(
1221 r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1222 if mobj is not None:
1223 return self.url_result(mobj.group('url'), 'Zapiks')
1224
e3216b82
NJ
1225 # Look for Kaltura embeds
1226 mobj = re.search(
1227 r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1228 if mobj is not None:
1229 return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1230
135c9c42
S
1231 # Look for Eagle.Platform embeds
1232 mobj = re.search(
1233 r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1234 if mobj is not None:
1235 return self.url_result(mobj.group('url'), 'EaglePlatform')
1236
d47ae7f6
S
1237 # Look for ClipYou (uses Eagle.Platform) embeds
1238 mobj = re.search(
1239 r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1240 if mobj is not None:
1241 return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1242
f8388757
S
1243 # Look for Pladform embeds
1244 mobj = re.search(
1245 r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1246 if mobj is not None:
1247 return self.url_result(mobj.group('url'), 'Pladform')
1248
ad320e9b
NJ
1249 # Look for 5min embeds
1250 mobj = re.search(
1251 r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1252 if mobj is not None:
1253 return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1254
def check_video(vurl):
    """Tell whether ``vurl`` should be kept as a candidate video URL.

    A URL accepted by ``YoutubeIE.suitable`` is always kept. Any other URL
    is kept only when its path contains a dot and the extension reported by
    ``determine_ext`` is not one of the rejected extensions below.
    """
    # Flash player, image and subtitle extensions are never direct videos.
    rejected_exts = ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')

    if YoutubeIE.suitable(vurl):
        return True

    url_path = compat_urlparse.urlparse(vurl).path
    extension = determine_ext(url_path)
    if '.' not in url_path:
        return False
    return extension not in rejected_exts
1261
def filter_video(urls):
    """Return a list of the entries of ``urls`` that ``check_video`` accepts, in order."""
    return [candidate for candidate in urls if check_video(candidate)]
1264
9b122384 1265 # Start with something easy: JW Player in SWFObject
ced659bb 1266 found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
b30b8698 1267 if not found:
d981cef6 1268 # Look for gorilla-vid style embedding
ced659bb 1269 found = filter_video(re.findall(r'''(?sx)
c0292e8a
PH
1270 (?:
1271 jw_plugins|
1272 JWPlayerOptions|
1273 jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1274 )
a0f71985
PH
1275 .*?
1276 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
b30b8698 1277 if not found:
9b122384 1278 # Broaden the search a little bit
ced659bb 1279 found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
b30b8698
PH
1280 if not found:
1281 # Broaden the findall a little bit: JWPlayer JS loader
ced659bb
S
1282 found = filter_video(re.findall(
1283 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
4d805e06
PH
1284 if not found:
1285 # Flow player
ced659bb 1286 found = filter_video(re.findall(r'''(?xs)
4d805e06
PH
1287 flowplayer\("[^"]+",\s*
1288 \{[^}]+?\}\s*,
52585fd6 1289 \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
4d805e06 1290 ["']?url["']?\s*:\s*["']([^"']+)["']
ced659bb 1291 ''', webpage))
501f13fb
PH
1292 if not found:
1293 # Cinerama player
1294 found = re.findall(
1295 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
b30b8698 1296 if not found:
9b122384 1297 # Try to find twitter cards info
ced659bb
S
1298 found = filter_video(re.findall(
1299 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
b30b8698 1300 if not found:
9b122384
PH
1301 # We look for Open Graph info:
1302 # We have to match any number of spaces between elements, since some sites try to align them (e.g. statigr.am)
b30b8698 1303 m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
9b122384
PH
1304 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1305 if m_video_type is not None:
ced659bb 1306 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
b30b8698 1307 if not found:
7fea7156 1308 # HTML5 video
9b32eca3 1309 found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
b30b8698 1310 if not found:
ed9a25dd 1311 REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
a5a45015 1312 found = re.search(
89ef304b 1313 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
ed9a25dd 1314 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
89ef304b 1315 webpage)
84f81016
S
1316 if not found:
1317 # Look also in Refresh HTTP header
1318 refresh_header = head_response.headers.get('Refresh')
1319 if refresh_header:
ed9a25dd 1320 found = re.search(REDIRECT_REGEX, refresh_header)
b30b8698
PH
1321 if found:
1322 new_url = found.group(1)
89ef304b
PH
1323 self.report_following_redirect(new_url)
1324 return {
1325 '_type': 'url',
1326 'url': new_url,
1327 }
b30b8698 1328 if not found:
416c7fcb 1329 raise UnsupportedError(url)
9b122384 1330
b30b8698
PH
1331 entries = []
1332 for video_url in found:
1333 video_url = compat_urlparse.urljoin(url, video_url)
1334 video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
9b122384 1335
b30b8698
PH
1336 # Sometimes, jwplayer extraction will result in a YouTube URL
1337 if YoutubeIE.suitable(video_url):
1338 entries.append(self.url_result(video_url, 'Youtube'))
1339 continue
9b122384 1340
b30b8698
PH
1341 # here's a fun little line of code for you:
1342 video_id = os.path.splitext(video_id)[0]
fc9713a1 1343
b30b8698
PH
1344 entries.append({
1345 'id': video_id,
1346 'url': video_url,
1347 'uploader': video_uploader,
1348 'title': video_title,
4d805e06 1349 'age_limit': age_limit,
b30b8698
PH
1350 })
1351
1352 if len(entries) == 1:
669f0e7c 1353 return entries[0]
b30b8698
PH
1354 else:
1355 for num, e in enumerate(entries, start=1):
13d8fbef
JMF
1356 # 'url' results don't have a title
1357 if e.get('title') is not None:
1358 e['title'] = '%s (%d)' % (e['title'], num)
b30b8698
PH
1359 return {
1360 '_type': 'playlist',
1361 'entries': entries,
1362 }