]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/generic.py
[eagleplatform] Add support for embeds
[yt-dlp.git] / youtube_dl / extractor / generic.py
CommitLineData
cfe50f04
JMF
1# encoding: utf-8
2
79649588
PH
3from __future__ import unicode_literals
4
9b122384
PH
5import os
6import re
7
8from .common import InfoExtractor
fc9713a1 9from .youtube import YoutubeIE
8c25f81b 10from ..compat import (
9b122384 11 compat_urllib_parse,
a5caba1e 12 compat_urlparse,
f7300c5c 13 compat_xml_parse_error,
8c25f81b
PH
14)
15from ..utils import (
b759a0d4 16 determine_ext,
9b122384 17 ExtractorError,
c8e9a235 18 float_or_none,
aa94a6d3 19 HEADRequest,
61ca9a80 20 is_html,
ed2d6a19 21 orderedSet,
bcf89ce6 22 parse_xml,
9d4660ca
PH
23 smuggle_url,
24 unescapeHTML,
42393ce2 25 unified_strdate,
4d54ef20 26 unsmuggle_url,
416c7fcb 27 UnsupportedError,
42393ce2 28 url_basename,
76c73715 29 xpath_text,
9b122384 30)
cfe50f04 31from .brightcove import BrightcoveIE
c0d0b01f 32from .ooyala import OoyalaIE
93d020dd 33from .rutv import RUTVIE
cb3ac1c6 34from .smotri import SmotriIE
1419fafd 35from .condenast import CondeNastIE
9b122384 36
0838239e 37
9b122384 38class GenericIE(InfoExtractor):
79649588 39 IE_DESC = 'Generic downloader that works on some sites'
9b122384 40 _VALID_URL = r'.*'
79649588 41 IE_NAME = 'generic'
cfe50f04
JMF
42 _TESTS = [
43 {
79649588 44 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
d360a146 45 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
79649588 46 'info_dict': {
d360a146
S
47 'id': '13601338388002',
48 'ext': 'mp4',
79649588
PH
49 'uploader': 'www.hodiho.fr',
50 'title': 'R\u00e9gis plante sa Jeep',
cfe50f04
JMF
51 }
52 },
c19f7764
JMF
53 # bandcamp page with custom domain
54 {
79649588
PH
55 'add_ie': ['Bandcamp'],
56 'url': 'http://bronyrock.com/track/the-pony-mash',
79649588 57 'info_dict': {
fd50bf62
S
58 'id': '3235767654',
59 'ext': 'mp3',
79649588
PH
60 'title': 'The Pony Mash',
61 'uploader': 'M_Pallante',
c19f7764 62 },
79649588 63 'skip': 'There is a limit of 200 free downloads / month for the test song',
c19f7764 64 },
eeb165e6 65 # embedded brightcove video
dd5bcdc4
JMF
66 # it also tests brightcove videos that need to set the 'Referer' in the
67 # http requests
eeb165e6 68 {
79649588
PH
69 'add_ie': ['Brightcove'],
70 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
71 'info_dict': {
72 'id': '2765128793001',
73 'ext': 'mp4',
74 'title': 'Le cours de bourse : l’analyse technique',
75 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
76 'uploader': 'BFM BUSINESS',
eeb165e6 77 },
79649588
PH
78 'params': {
79 'skip_download': True,
eeb165e6
JMF
80 },
81 },
17ab4d3b
PH
82 {
83 # https://github.com/rg3/youtube-dl/issues/2253
84 'url': 'http://bcove.me/i6nfkrc3',
17ab4d3b
PH
85 'md5': '0ba9446db037002366bab3b3eb30c88c',
86 'info_dict': {
fd50bf62
S
87 'id': '3101154703001',
88 'ext': 'mp4',
17ab4d3b
PH
89 'title': 'Still no power',
90 'uploader': 'thestar.com',
91 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
92 },
93 'add_ie': ['Brightcove'],
94 },
0479c625
S
95 {
96 'url': 'http://www.championat.com/video/football/v/87/87499.html',
97 'md5': 'fb973ecf6e4a78a67453647444222983',
98 'info_dict': {
99 'id': '3414141473001',
100 'ext': 'mp4',
101 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
102 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
103 'uploader': 'Championat',
104 },
105 },
bdf97017 106 {
37aab278 107 # https://github.com/rg3/youtube-dl/issues/3541
bdf97017
NJ
108 'add_ie': ['Brightcove'],
109 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
110 'info_dict': {
111 'id': '3866516442001',
37aab278 112 'ext': 'mp4',
bdf97017
NJ
113 'title': 'Leer mij vrouwen kennen: Aflevering 1',
114 'description': 'Leer mij vrouwen kennen: Aflevering 1',
115 'uploader': 'SBS Broadcasting',
116 },
37aab278 117 'skip': 'Restricted to Netherlands',
bdf97017 118 'params': {
37aab278 119 'skip_download': True, # m3u8 download
bdf97017
NJ
120 },
121 },
42393ce2
PH
122 # Direct link to a video
123 {
79649588 124 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
79649588
PH
125 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
126 'info_dict': {
127 'id': 'trailer',
89ef304b 128 'ext': 'mp4',
79649588
PH
129 'title': 'trailer',
130 'upload_date': '20100513',
42393ce2 131 }
c0d0b01f
JMF
132 },
133 # ooyala video
134 {
79649588 135 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
87830900 136 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
79649588
PH
137 'info_dict': {
138 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
139 'ext': 'mp4',
3486df38 140 'title': '2cc213299525360.mov', # that's what we get
c0d0b01f 141 },
87830900 142 'add_ie': ['Ooyala'],
c0d0b01f 143 },
f076b638 144 # multiple ooyala embeds on SBN network websites
145 {
146 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
147 'info_dict': {
148 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
149 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
150 },
151 'playlist_mincount': 3,
152 'params': {
153 'skip_download': True,
154 },
155 'add_ie': ['Ooyala'],
156 },
89ef304b
PH
157 # google redirect
158 {
159 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
160 'info_dict': {
161 'id': 'cmQHVoWB5FY',
162 'ext': 'mp4',
163 'upload_date': '20130224',
164 'uploader_id': 'TheVerge',
87830900 165 'description': 're:^Chris Ziegler takes a look at the\.*',
89ef304b
PH
166 'uploader': 'The Verge',
167 'title': 'First Firefox OS phones side-by-side',
168 },
169 'params': {
170 'skip_download': False,
171 }
f55a1f0a 172 },
1b86cc41 173 # embed.ly video
174 {
175 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
176 'info_dict': {
177 'id': '9ODmcdjQcHQ',
178 'ext': 'mp4',
0a5bce56
PH
179 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
180 'upload_date': '20140225',
181 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
182 'uploader': 'Tested',
183 'uploader_id': 'testedcom',
1b86cc41 184 },
185 # No need to test YoutubeIE here
186 'params': {
187 'skip_download': True,
188 },
189 },
60cc4dc4
PH
190 # funnyordie embed
191 {
192 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
60cc4dc4
PH
193 'info_dict': {
194 'id': '18e820ec3f',
195 'ext': 'mp4',
196 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
197 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
93d020dd 198 },
60cc4dc4 199 },
faa4ea68
S
200 # BBC iPlayer embeds
201 {
202 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
203 'info_dict': {
204 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
205 },
206 'playlist_mincount': 18,
207 },
93d020dd
S
208 # RUTV embed
209 {
210 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
211 'info_dict': {
212 'id': '776940',
213 'ext': 'mp4',
214 'title': 'Охотское море стало целиком российским',
215 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
216 },
217 'params': {
218 # m3u8 download
219 'skip_download': True,
220 },
aab74fa1
PH
221 },
222 # Embedded TED video
223 {
224 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
a8eb5a8e 225 'md5': '65fdff94098e4a607385a60c5177c638',
aab74fa1 226 'info_dict': {
a8eb5a8e 227 'id': '1969',
aab74fa1 228 'ext': 'mp4',
a8eb5a8e
PH
229 'title': 'Hidden miracles of the natural world',
230 'uploader': 'Louie Schwartzberg',
231 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
aab74fa1 232 }
60cc4dc4 233 },
5c386252 234 # Embedded Ustream video
235 {
236 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
237 'md5': '27b99cdb639c9b12a79bca876a073417',
238 'info_dict': {
ca6aada4 239 'id': '45734260',
240 'ext': 'flv',
241 'uploader': 'AU SPA: The NSA and Privacy',
5c386252 242 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
243 }
244 },
d95e35d6
S
245 # nowvideo embed hidden behind percent encoding
246 {
247 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
248 'md5': '2baf4ddd70f697d94b1c18cf796d5107',
249 'info_dict': {
250 'id': '06e53103ca9aa',
251 'ext': 'flv',
252 'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
253 'description': 'No description',
254 },
0f2a2ba1 255 },
893f8832
PH
256 # arte embed
257 {
258 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
259 'md5': '7653032cbb25bf6c80d80f217055fa43',
260 'info_dict': {
261 'id': '048195-004_PLUS7-F',
262 'ext': 'flv',
263 'title': 'X:enius',
264 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
265 'upload_date': '20140320',
266 },
267 'params': {
268 'skip_download': 'Requires rtmpdump'
269 }
270 },
fa35cdad
PH
271 # Condé Nast embed
272 {
273 'url': 'http://www.wired.com/2014/04/honda-asimo/',
274 'md5': 'ba0dfe966fa007657bd1443ee672db0f',
275 'info_dict': {
276 'id': '53501be369702d3275860000',
277 'ext': 'mp4',
278 'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
279 }
ebd3c7b3
PH
280 },
281 # Dailymotion embed
282 {
283 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
284 'md5': '441aeeb82eb72c422c7f14ec533999cd',
285 'info_dict': {
286 'id': 'k2mm4bCdJ6CQ2i7c8o2',
287 'ext': 'mp4',
288 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
289 'uploader': 'Spi0n',
290 },
291 'add_ie': ['Dailymotion'],
2b88feed
PH
292 },
293 # YouTube embed
294 {
295 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
296 'info_dict': {
297 'id': 'FXRb4ykk4S0',
298 'ext': 'mp4',
299 'title': 'The NBL Auction 2014',
300 'uploader': 'BADMINTON England',
301 'uploader_id': 'BADMINTONEvents',
302 'upload_date': '20140603',
303 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
304 },
305 'add_ie': ['Youtube'],
306 'params': {
307 'skip_download': True,
308 }
309 },
c5cd249e
JMF
310 # MTVServices embed
311 {
312 'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
313 'md5': '35727f82f58c76d996fc188f9755b0d5',
314 'info_dict': {
315 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
316 'ext': 'mp4',
317 'title': 'Review',
318 'description': 'Mario\'s life in the fast lane has never looked so good.',
319 },
320 },
61013473 321 # YouTube embed via <data-embed-url="">
322 {
323 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
61013473 324 'info_dict': {
a8eb5a8e 325 'id': '4vAffPZIT44',
61013473 326 'ext': 'mp4',
a8eb5a8e 327 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
ed2d6a19
PH
328 'uploader': 'Gameloft',
329 'uploader_id': 'gameloft',
a8eb5a8e
PH
330 'upload_date': '20140828',
331 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
ed2d6a19
PH
332 },
333 'params': {
334 'skip_download': True,
61013473 335 }
c8e9a235
PH
336 },
337 # Camtasia studio
338 {
339 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
340 'playlist': [{
341 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
342 'info_dict': {
343 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
344 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
345 'ext': 'flv',
346 'duration': 2235.90,
347 }
348 }, {
349 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
350 'info_dict': {
351 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
352 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
353 'ext': 'flv',
354 'duration': 2235.93,
355 }
356 }],
357 'info_dict': {
358 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
359 }
4d805e06
PH
360 },
361 # Flowplayer
362 {
363 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
364 'md5': '9d65602bf31c6e20014319c7d07fba27',
365 'info_dict': {
366 'id': '5123ea6d5e5a7',
367 'ext': 'mp4',
368 'age_limit': 18,
369 'uploader': 'www.handjobhub.com',
d6d9186f 370 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
4d805e06 371 }
0990305d
PH
372 },
373 # RSS feed
374 {
375 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
376 'info_dict': {
377 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
378 'title': 'Zero Punctuation',
b1b0b1ca 379 'description': 're:.*groundbreaking video review series.*'
0990305d
PH
380 },
381 'playlist_mincount': 11,
22a6f150
PH
382 },
383 # Multiple brightcove videos
384 # https://github.com/rg3/youtube-dl/issues/2283
385 {
386 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
387 'info_dict': {
388 'id': 'always-never',
389 'title': 'Always / Never - The New Yorker',
390 },
391 'playlist_count': 3,
392 'params': {
393 'extract_flat': False,
394 'skip_download': True,
395 }
1a94ff68
S
396 },
397 # MLB embed
398 {
399 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
400 'md5': '96f09a37e44da40dd083e12d9a683327',
401 'info_dict': {
402 'id': '33322633',
403 'ext': 'mp4',
404 'title': 'Ump changes call to ball',
405 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
406 'duration': 48,
407 'timestamp': 1401537900,
408 'upload_date': '20140531',
409 'thumbnail': 're:^https?://.*\.jpg$',
410 },
411 },
746c67d7
NJ
412 # Wistia embed
413 {
414 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
415 'md5': '8788b683c777a5cf25621eaf286d0c23',
416 'info_dict': {
417 'id': '1cfaf6b7ea',
418 'ext': 'mov',
419 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
420 'duration': 643.0,
421 'filesize': 182808282,
422 'uploader': 'education-portal.com',
423 },
424 },
52cffcb1 425 {
426 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
427 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
428 'info_dict': {
429 'id': 'uxjb0lwrcz',
430 'ext': 'mp4',
85d7b765 431 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
52cffcb1 432 'duration': 1715.0,
85d7b765 433 'uploader': 'thoughtworks.wistia.com',
70b7e3fb 434 },
52cffcb1 435 },
70b7e3fb
PH
436 # Direct download with broken HEAD
437 {
438 'url': 'http://ai-radio.org:8000/radio.opus',
439 'info_dict': {
440 'id': 'radio',
441 'ext': 'opus',
442 'title': 'radio',
443 },
444 'params': {
445 'skip_download': True, # infinite live stream
446 },
447 'expected_warnings': [
448 r'501.*Not Implemented'
449 ],
ac645ac7
PH
450 },
451 # Soundcloud embed
452 {
453 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
454 'info_dict': {
455 'id': '174391317',
456 'ext': 'mp3',
457 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
458 'uploader': 'Sophos Security',
459 'title': 'Chet Chat 171 - Oct 29, 2014',
460 'upload_date': '20141029',
461 }
af63fed7
PH
462 },
463 # Livestream embed
464 {
465 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
466 'info_dict': {
467 'id': '67864563',
468 'ext': 'flv',
469 'upload_date': '20141112',
470 'title': 'Rosetta #CometLanding webcast HL 10',
471 }
472 },
65f3a228
PH
473 # LazyYT
474 {
475 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
476 'info_dict': {
11e611a7 477 'id': '1986',
65f3a228
PH
478 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
479 },
480 'playlist_mincount': 2,
4e262a88
PH
481 },
482 # Direct link with incorrect MIME type
483 {
484 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
485 'md5': '4ccbebe5f36706d85221f204d7eb5913',
486 'info_dict': {
487 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
488 'id': '5_Lennart_Poettering_-_Systemd',
489 'ext': 'webm',
490 'title': '5_Lennart_Poettering_-_Systemd',
491 'upload_date': '20141120',
492 },
493 'expected_warnings': [
494 'URL could be a direct video link, returning it as such.'
495 ]
42bdd9d0
PH
496 },
497 # Cinchcast embed
498 {
499 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
500 'info_dict': {
501 'id': '7141703',
502 'ext': 'mp3',
503 'upload_date': '20141126',
504 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
505 }
506 },
501f13fb
PH
507 # Cinerama player
508 {
509 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
510 'info_dict': {
511 'id': '730m_DandD_1901_512k',
512 'ext': 'mp4',
513 'uploader': 'www.abc.net.au',
514 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
515 }
796df3c6
S
516 },
517 # embedded viddler video
518 {
519 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
520 'info_dict': {
521 'id': '4d03aad9',
522 'ext': 'mp4',
523 'uploader': 'deadspin',
524 'title': 'WALL-TO-GORTAT',
525 'timestamp': 1422285291,
526 'upload_date': '20150126',
527 },
528 'add_ie': ['Viddler'],
a0f71985
PH
529 },
530 # jwplayer YouTube
531 {
532 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
533 'info_dict': {
534 'id': 'Mrj4DVp2zeA',
535 'ext': 'mp4',
f37e3f99 536 'upload_date': '20150212',
a0f71985
PH
537 'uploader': 'The National Archives UK',
538 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
539 'uploader_id': 'NationalArchives08',
540 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
541 },
59b8ab58
PH
542 },
543 # rtl.nl embed
544 {
545 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
546 'playlist_mincount': 5,
547 'info_dict': {
548 'id': 'aanslagen-kopenhagen',
549 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
550 }
255fca5e
S
551 },
552 # Zapiks embed
553 {
554 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
555 'info_dict': {
556 'id': '118046',
557 'ext': 'mp4',
558 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
559 }
560 },
e3216b82
NJ
561 # Kaltura embed
562 {
563 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
564 'info_dict': {
565 'id': '1_eergr3h1',
566 'ext': 'mp4',
567 'upload_date': '20150226',
568 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
569 'timestamp': int,
570 'title': 'John Carlson Postgame 2/25/15',
571 },
572 },
135c9c42
S
573 # Eagle.Platform embed (generic URL)
574 {
575 'url': 'http://lenta.ru/news/2015/03/06/navalny/',
576 'info_dict': {
577 'id': '227304',
578 'ext': 'mp4',
579 'title': 'Навальный вышел на свободу',
580 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
581 'thumbnail': 're:^https?://.*\.jpg$',
582 'duration': 87,
583 'view_count': int,
584 'age_limit': 0,
585 },
586 },
76c73715
PH
587 # RSS feed with enclosure
588 {
589 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
590 'info_dict': {
591 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
592 'ext': 'm4v',
593 'upload_date': '20150228',
594 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
595 }
596 }
cfe50f04 597 ]
9b122384 598
9b122384
PH
599 def report_following_redirect(self, new_url):
600 """Report information extraction."""
79649588 601 self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
9b122384 602
4fc946b5
PH
603 def _extract_rss(self, url, video_id, doc):
604 playlist_title = doc.find('./channel/title').text
605 playlist_desc_el = doc.find('./channel/description')
606 playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
607
76c73715
PH
608 entries = []
609 for it in doc.findall('./channel/item'):
610 next_url = xpath_text(it, 'link', fatal=False)
611 if not next_url:
612 enclosure_nodes = it.findall('./enclosure')
613 for e in enclosure_nodes:
614 next_url = e.attrib.get('url')
615 if next_url:
616 break
617
618 if not next_url:
619 continue
620
621 entries.append({
622 '_type': 'url',
623 'url': next_url,
624 'title': it.find('title').text,
625 })
4fc946b5
PH
626
627 return {
628 '_type': 'playlist',
629 'id': url,
630 'title': playlist_title,
631 'description': playlist_desc,
632 'entries': entries,
633 }
634
c8e9a235
PH
635 def _extract_camtasia(self, url, video_id, webpage):
636 """ Returns None if no camtasia video can be found. """
637
638 camtasia_cfg = self._search_regex(
639 r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
640 webpage, 'camtasia configuration file', default=None)
641 if camtasia_cfg is None:
642 return None
643
644 title = self._html_search_meta('DC.title', webpage, fatal=True)
645
646 camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
647 camtasia_cfg = self._download_xml(
648 camtasia_url, video_id,
649 note='Downloading camtasia configuration',
650 errnote='Failed to download camtasia configuration')
651 fileset_node = camtasia_cfg.find('./playlist/array/fileset')
652
653 entries = []
654 for n in fileset_node.getchildren():
655 url_n = n.find('./uri')
656 if url_n is None:
657 continue
658
659 entries.append({
660 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
661 'title': '%s - %s' % (title, n.tag),
662 'url': compat_urlparse.urljoin(url, url_n.text),
663 'duration': float_or_none(n.find('./duration').text),
664 })
665
666 return {
667 '_type': 'playlist',
668 'entries': entries,
669 'title': title,
670 }
671
9b122384 672 def _real_extract(self, url):
ebd3c7b3
PH
673 if url.startswith('//'):
674 return {
675 '_type': 'url',
20991253 676 'url': self.http_scheme() + url,
ebd3c7b3
PH
677 }
678
a7130543
JMF
679 parsed_url = compat_urlparse.urlparse(url)
680 if not parsed_url.scheme:
04b4d394
PH
681 default_search = self._downloader.params.get('default_search')
682 if default_search is None:
1f7ccb90 683 default_search = 'fixup_error'
04b4d394 684
1f7ccb90 685 if default_search in ('auto', 'auto_warning', 'fixup_error'):
04b4d394
PH
686 if '/' in url:
687 self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
688 return self.url_result('http://' + url)
1f7ccb90 689 elif default_search != 'fixup_error':
9c1fc022 690 if default_search == 'auto_warning':
0e67ab0d
PH
691 if re.match(r'^(?:url|URL)$', url):
692 raise ExtractorError(
693 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
694 expected=True)
695 else:
696 self._downloader.report_warning(
7571c02c 697 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
04b4d394 698 return self.url_result('ytsearch:' + url)
1f7ccb90
PH
699
700 if default_search in ('error', 'fixup_error'):
7571c02c 701 raise ExtractorError(
b74e86f4
PH
702 '%r is not a valid URL. '
703 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
704 % (url, url), expected=True)
04b4d394 705 else:
f2f2c0c2
PH
706 if ':' not in default_search:
707 default_search += ':'
04b4d394 708 return self.url_result(default_search + url)
4d54ef20
PH
709
710 url, smuggled_data = unsmuggle_url(url)
711 force_videoid = None
d6e6a422 712 is_intentional = smuggled_data and smuggled_data.get('to_generic')
4d54ef20
PH
713 if smuggled_data and 'force_videoid' in smuggled_data:
714 force_videoid = smuggled_data['force_videoid']
715 video_id = force_videoid
716 else:
717 video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
a7130543 718
79649588 719 self.to_screen('%s: Requesting header' % video_id)
c1d1facd 720
ebab4520 721 head_req = HEADRequest(url)
23be51d8 722 head_response = self._request_webpage(
ebab4520
PH
723 head_req, video_id,
724 note=False, errnote='Could not send HEAD request to %s' % url,
725 fatal=False)
42393ce2 726
23be51d8 727 if head_response is not False:
42393ce2 728 # Check for redirect
23be51d8 729 new_url = head_response.geturl()
42393ce2
PH
730 if url != new_url:
731 self.report_following_redirect(new_url)
4d54ef20
PH
732 if force_videoid:
733 new_url = smuggle_url(
734 new_url, {'force_videoid': force_videoid})
cecaaf3f 735 return self.url_result(new_url)
42393ce2 736
23be51d8
PH
737 full_response = None
738 if head_response is False:
739 full_response = self._request_webpage(url, video_id)
740 head_response = full_response
741
742 # Check for direct link to a video
743 content_type = head_response.headers.get('Content-Type', '')
744 m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
745 if m:
746 upload_date = unified_strdate(
747 head_response.headers.get('Last-Modified'))
748 return {
749 'id': video_id,
750 'title': os.path.splitext(url_basename(url))[0],
ccdd0ffb 751 'direct': True,
23be51d8
PH
752 'formats': [{
753 'format_id': m.group('format_id'),
754 'url': url,
755 'vcodec': 'none' if m.group('type') == 'audio' else None
756 }],
757 'upload_date': upload_date,
758 }
42393ce2 759
d6e6a422
PH
760 if not self._downloader.params.get('test', False) and not is_intentional:
761 self._downloader.report_warning('Falling back on generic information extractor.')
762
4e262a88
PH
763 if not full_response:
764 full_response = self._request_webpage(url, video_id)
765
766 # Maybe it's a direct link to a video?
767 # Be careful not to download the whole thing!
768 first_bytes = full_response.read(512)
61ca9a80 769 if not is_html(first_bytes):
4e262a88
PH
770 self._downloader.report_warning(
771 'URL could be a direct video link, returning it as such.')
772 upload_date = unified_strdate(
773 head_response.headers.get('Last-Modified'))
774 return {
775 'id': video_id,
776 'title': os.path.splitext(url_basename(url))[0],
777 'direct': True,
778 'url': url,
779 'upload_date': upload_date,
780 }
781
782 webpage = self._webpage_read_content(
783 full_response, url, video_id, prefix=first_bytes)
784
9b122384 785 self.report_extraction(video_id)
887c6acd 786
4fc946b5
PH
787 # Is it an RSS feed?
788 try:
bcf89ce6 789 doc = parse_xml(webpage)
4fc946b5
PH
790 if doc.tag == 'rss':
791 return self._extract_rss(url, video_id, doc)
f7300c5c 792 except compat_xml_parse_error:
4fc946b5
PH
793 pass
794
c8e9a235
PH
795 # Is it a Camtasia project?
796 camtasia_res = self._extract_camtasia(url, video_id, webpage)
797 if camtasia_res is not None:
798 return camtasia_res
799
14390730
S
800 # Sometimes embedded video player is hidden behind percent encoding
801 # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
802 # Unescaping the whole page allows to handle those cases in a generic way
1f7659db
S
803 webpage = compat_urllib_parse.unquote(webpage)
804
887c6acd
PH
805 # it's tempting to parse this further, but you would
806 # have to take into account all the variations like
807 # Video Title - Site Name
808 # Site Name | Video Title
809 # Video Title - Tagline | Site Name
810 # and so on and so forth; it's just not practical
ef4fd848 811 video_title = self._html_search_regex(
79649588
PH
812 r'(?s)<title>(.*?)</title>', webpage, 'video title',
813 default='video')
ef4fd848 814
4d805e06
PH
815 # Try to detect age limit automatically
816 age_limit = self._rta_search(webpage)
817 # And then there are the jokers who advertise that they use RTA,
818 # but actually don't.
819 AGE_LIMIT_MARKERS = [
820 r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
821 ]
822 if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
823 age_limit = 18
824
ef4fd848
PH
825 # video uploader is domain name
826 video_uploader = self._search_regex(
79649588 827 r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
887c6acd 828
ed2d6a19 829 # Helper method
83992676 830 def _playlist_from_matches(matches, getter=None, ie=None):
3b2f933b 831 urlrs = orderedSet(
83992676 832 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
3b2f933b 833 for m in matches)
ed2d6a19
PH
834 return self.playlist_result(
835 urlrs, playlist_id=video_id, playlist_title=video_title)
836
627a91a9 837 # Look for BrightCove:
99877772
PH
838 bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
839 if bc_urls:
79649588 840 self.to_screen('Brightcove video detected.')
99877772
PH
841 entries = [{
842 '_type': 'url',
843 'url': smuggle_url(bc_url, {'Referer': url}),
844 'ie_key': 'Brightcove'
845 } for bc_url in bc_urls]
846
847 return {
848 '_type': 'playlist',
849 'title': video_title,
850 'id': video_id,
851 'entries': entries,
852 }
cfe50f04 853
59b8ab58
PH
854 # Look for embedded rtl.nl player
855 matches = re.findall(
856 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
857 webpage)
858 if matches:
859 return _playlist_from_matches(matches, ie='RtlNl')
860
7115ca84 861 # Look for embedded (iframe) Vimeo player
9d4660ca 862 mobj = re.search(
15fd51b3 863 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
9d4660ca 864 if mobj:
15fd51b3 865 player_url = unescapeHTML(mobj.group('url'))
9d4660ca 866 surl = smuggle_url(player_url, {'Referer': url})
09a42738 867 return self.url_result(surl)
7115ca84
PH
868 # Look for embedded (swf embed) Vimeo player
869 mobj = re.search(
09a42738 870 r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
7115ca84 871 if mobj:
09a42738 872 return self.url_result(mobj.group(1))
7115ca84 873
53c1d3ef 874 # Look for embedded YouTube player
1f9da904 875 matches = re.findall(r'''(?x)
2b88feed
PH
876 (?:
877 <iframe[^>]+?src=|
c71dfccc 878 data-video-url=|
2b88feed 879 <embed[^>]+?src=|
a7e97f6d
PH
880 embedSWF\(?:\s*|
881 new\s+SWFObject\(
2b88feed
PH
882 )
883 (["\'])
1bf5423e 884 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
6b08cdf6 885 (?:embed|v|p)/.+?)
1f9da904 886 \1''', webpage)
887c6acd 887 if matches:
ed2d6a19 888 return _playlist_from_matches(
3b2f933b 889 matches, lambda m: unescapeHTML(m[1]))
53c1d3ef 890
65f3a228
PH
891 # Look for lazyYT YouTube embed
892 matches = re.findall(
893 r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
894 if matches:
895 return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
896
355e4fd0
PH
897 # Look for embedded Dailymotion player
898 matches = re.findall(
ef4fd848 899 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
355e4fd0 900 if matches:
ed2d6a19
PH
901 return _playlist_from_matches(
902 matches, lambda m: unescapeHTML(m[1]))
355e4fd0 903
8489578d
NJ
904 # Look for embedded Dailymotion playlist player (#3822)
905 m = re.search(
906 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
907 if m:
908 playlists = re.findall(
909 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
910 if playlists:
911 return _playlist_from_matches(
912 playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
913
ef4fd848
PH
914 # Look for embedded Wistia player
915 match = re.search(
281d3f1d 916 r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
ef4fd848 917 if match:
9471c444
NJ
918 embed_url = self._proto_relative_url(
919 unescapeHTML(match.group('url')))
ef4fd848
PH
920 return {
921 '_type': 'url_transparent',
9471c444 922 'url': embed_url,
ef4fd848
PH
923 'ie_key': 'Wistia',
924 'uploader': video_uploader,
925 'title': video_title,
926 'id': video_id,
927 }
5f6a1245 928
9471c444 929 match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
746c67d7
NJ
930 if match:
931 return {
932 '_type': 'url_transparent',
933 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
934 'ie_key': 'Wistia',
935 'uploader': video_uploader,
936 'title': video_title,
937 'id': match.group('id')
938 }
ef4fd848 939
ee3e63e4 940 # Look for embedded blip.tv player
19dab5e6 941 mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
ee3e63e4 942 if mobj:
2514d263 943 return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
1f8b6af7 944 mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
ee3e63e4 945 if mobj:
19dab5e6 946 return self.url_result(mobj.group(1), 'BlipTV')
ee3e63e4 947
fa35cdad
PH
948 # Look for embedded condenast player
949 matches = re.findall(
950 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
951 webpage)
952 if matches:
953 return {
954 '_type': 'playlist',
955 'entries': [{
956 '_type': 'url',
957 'ie_key': 'CondeNast',
958 'url': ma,
959 } for ma in matches],
960 'title': video_title,
961 'id': video_id,
962 }
963
c19f7764
JMF
964 # Look for Bandcamp pages with custom domain
965 mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
966 if mobj is not None:
967 burl = unescapeHTML(mobj.group(1))
09804265
JMF
968 # Don't set the extractor because it can be a track url or an album
969 return self.url_result(burl)
c19f7764 970
f25571ff
PH
971 # Look for embedded Vevo player
972 mobj = re.search(
973 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
974 if mobj is not None:
975 return self.url_result(mobj.group('url'))
796df3c6
S
976
977 # Look for embedded Viddler player
cb454b33
S
978 mobj = re.search(
979 r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
980 webpage)
796df3c6
S
981 if mobj is not None:
982 return self.url_result(mobj.group('url'))
f25571ff 983
c0d0b01f 984 # Look for Ooyala videos
cb454b33 985 mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
f076b638 986 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
987 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage))
c0d0b01f 988 if mobj is not None:
750f9020 989 return OoyalaIE._build_url_result(mobj.group('ec'))
c0d0b01f 990
f076b638 991 # Look for multiple Ooyala embeds on SBN network websites
992 mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
993 if mobj is not None:
994 embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
995 if embeds:
996 return _playlist_from_matches(
997 embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
998
aa94a6d3 999 # Look for Aparat videos
48099643 1000 mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
aa94a6d3
PH
1001 if mobj is not None:
1002 return self.url_result(mobj.group(1), 'Aparat')
1003
c93c2ab1 1004 # Look for MPORA videos
c3f51436 1005 mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
c93c2ab1
PH
1006 if mobj is not None:
1007 return self.url_result(mobj.group(1), 'Mpora')
5f59ee79 1008
15c0e8e7 1009 # Look for embedded NovaMov-based player
8f89e687 1010 mobj = re.search(
8dfa187b 1011 r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
15c0e8e7
S
1012 (?P<url>http://(?:(?:embed|www)\.)?
1013 (?:novamov\.com|
1014 nowvideo\.(?:ch|sx|eu|at|ag|co)|
1015 videoweed\.(?:es|com)|
1016 movshare\.(?:net|sx|ag)|
1017 divxstage\.(?:eu|net|ch|co|at|ag))
1018 /embed\.php.+?)\1''', webpage)
8f89e687 1019 if mobj is not None:
15c0e8e7 1020 return self.url_result(mobj.group('url'))
50f56607 1021
9834872b
PH
1022 # Look for embedded Facebook player
1023 mobj = re.search(
db1f3888 1024 r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
9834872b
PH
1025 if mobj is not None:
1026 return self.url_result(mobj.group('url'), 'Facebook')
1027
ca97a56e
S
1028 # Look for embedded VK player
1029 mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1030 if mobj is not None:
1031 return self.url_result(mobj.group('url'), 'VK')
1032
0364fa8b
S
1033 # Look for embedded ivi player
1034 mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1035 if mobj is not None:
1036 return self.url_result(mobj.group('url'), 'Ivi')
1037
db1f3888
PH
1038 # Look for embedded Huffington Post player
1039 mobj = re.search(
c3f51436 1040 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
db1f3888
PH
1041 if mobj is not None:
1042 return self.url_result(mobj.group('url'), 'HuffPost')
1043
1b86cc41 1044 # Look for embed.ly
1045 mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1046 if mobj is not None:
1047 return self.url_result(mobj.group('url'))
1048 mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1049 if mobj is not None:
1050 return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1051
60cc4dc4
PH
1052 # Look for funnyordie embed
1053 matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1054 if matches:
ed2d6a19
PH
1055 return _playlist_from_matches(
1056 matches, getter=unescapeHTML, ie='FunnyOrDie')
60cc4dc4 1057
db546cf8
S
1058 # Look for BBC iPlayer embed
1059 matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1060 if matches:
476eae0c 1061 return _playlist_from_matches(matches, ie='BBCCoUk')
db546cf8 1062
93d020dd
S
1063 # Look for embedded RUTV player
1064 rutv_url = RUTVIE._extract_url(webpage)
1065 if rutv_url:
1066 return self.url_result(rutv_url, 'RUTV')
1067
7e2ede98
JMF
1068 # Look for embedded TED player
1069 mobj = re.search(
d7cc31b6 1070 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
7e2ede98
JMF
1071 if mobj is not None:
1072 return self.url_result(mobj.group('url'), 'TED')
1073
5c386252 1074 # Look for embedded Ustream videos
1075 mobj = re.search(
1076 r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1077 if mobj is not None:
1078 return self.url_result(mobj.group('url'), 'Ustream')
1079
893f8832
PH
1080 # Look for embedded arte.tv player
1081 mobj = re.search(
1082 r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1083 webpage)
1084 if mobj is not None:
1085 return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1086
cb3ac1c6
S
1087 # Look for embedded smotri.com player
1088 smotri_url = SmotriIE._extract_url(webpage)
1089 if smotri_url:
1090 return self.url_result(smotri_url, 'Smotri')
1091
20991253
PH
1092 # Look for embedded soundcloud player
1093 mobj = re.search(
ac645ac7 1094 r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
20991253
PH
1095 webpage)
1096 if mobj is not None:
1097 url = unescapeHTML(mobj.group('url'))
1098 return self.url_result(url)
1099
826ec77f
PH
1100 # Look for embedded vulture.com player
1101 mobj = re.search(
1102 r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1103 webpage)
1104 if mobj is not None:
1105 url = unescapeHTML(mobj.group('url'))
1106 return self.url_result(url, ie='Vulture')
1107
c5cd249e
JMF
1108 # Look for embedded mtvservices player
1109 mobj = re.search(
1110 r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1111 webpage)
1112 if mobj is not None:
1113 url = unescapeHTML(mobj.group('url'))
1114 return self.url_result(url, ie='MTVServicesEmbedded')
1115
49807b4a
S
1116 # Look for embedded yahoo player
1117 mobj = re.search(
1118 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1119 webpage)
1120 if mobj is not None:
1121 return self.url_result(mobj.group('url'), 'Yahoo')
1122
2ef6fcb5
PH
1123 # Look for embedded sbs.com.au player
1124 mobj = re.search(
e98b8e79
PH
1125 r'''(?x)
1126 (?:
1127 <meta\s+property="og:video"\s+content=|
1128 <iframe[^>]+?src=
1129 )
1130 (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
2ef6fcb5
PH
1131 webpage)
1132 if mobj is not None:
1133 return self.url_result(mobj.group('url'), 'SBS')
1134
42bdd9d0
PH
1135 # Look for embedded Cinchcast player
1136 mobj = re.search(
1137 r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1138 webpage)
1139 if mobj is not None:
1140 return self.url_result(mobj.group('url'), 'Cinchcast')
1141
1a94ff68 1142 mobj = re.search(
5263cdfc 1143 r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1a94ff68
S
1144 webpage)
1145 if mobj is not None:
1146 return self.url_result(mobj.group('url'), 'MLB')
1147
1419fafd
S
1148 mobj = re.search(
1149 r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1150 webpage)
1151 if mobj is not None:
1152 return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1153
af63fed7
PH
1154 mobj = re.search(
1155 r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1156 webpage)
1157 if mobj is not None:
1158 return self.url_result(mobj.group('url'), 'Livestream')
1159
255fca5e
S
1160 # Look for Zapiks embed
1161 mobj = re.search(
1162 r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1163 if mobj is not None:
1164 return self.url_result(mobj.group('url'), 'Zapiks')
1165
e3216b82
NJ
1166 # Look for Kaltura embeds
1167 mobj = re.search(
1168 r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1169 if mobj is not None:
1170 return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1171
135c9c42
S
1172 # Look for Eagle.Platform embeds
1173 mobj = re.search(
1174 r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1175 if mobj is not None:
1176 return self.url_result(mobj.group('url'), 'EaglePlatform')
1177
def check_video(vurl):
    """Tell whether *vurl* looks like a direct link to a playable video.

    URLs that the YouTube extractor accepts are always kept.  Any other
    URL is kept only when its path contains a dot and the extension
    reported by determine_ext() is not one of the known non-video
    (Flash, image or subtitle) extensions.
    """
    non_video_exts = ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')

    # YouTube links have no file extension, so accept them up front.
    if YoutubeIE.suitable(vurl):
        return True

    url_path = compat_urlparse.urlparse(vurl).path
    extension = determine_ext(url_path)
    # A path without any dot cannot carry a media file extension.
    if '.' not in url_path:
        return False
    return extension not in non_video_exts
1184
def filter_video(urls):
    """Return a list of the entries of *urls* accepted by check_video().

    The relative order of the kept URLs is the order of *urls*.
    """
    return [candidate for candidate in urls if check_video(candidate)]
1187
9b122384 1188 # Start with something easy: JW Player in SWFObject
ced659bb 1189 found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
b30b8698 1190 if not found:
d981cef6 1191 # Look for gorilla-vid style embedding
ced659bb 1192 found = filter_video(re.findall(r'''(?sx)
c0292e8a
PH
1193 (?:
1194 jw_plugins|
1195 JWPlayerOptions|
1196 jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1197 )
a0f71985
PH
1198 .*?
1199 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
b30b8698 1200 if not found:
9b122384 1201 # Broaden the search a little bit
ced659bb 1202 found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
b30b8698
PH
1203 if not found:
1204 # Broaden the search a little bit: JWPlayer JS loader
ced659bb
S
1205 found = filter_video(re.findall(
1206 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
4d805e06
PH
1207 if not found:
1208 # Flow player
ced659bb 1209 found = filter_video(re.findall(r'''(?xs)
4d805e06
PH
1210 flowplayer\("[^"]+",\s*
1211 \{[^}]+?\}\s*,
52585fd6 1212 \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
4d805e06 1213 ["']?url["']?\s*:\s*["']([^"']+)["']
ced659bb 1214 ''', webpage))
501f13fb
PH
1215 if not found:
1216 # Cinerama player
1217 found = re.findall(
1218 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
b30b8698 1219 if not found:
9b122384 1220 # Try to find twitter cards info
ced659bb
S
1221 found = filter_video(re.findall(
1222 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
b30b8698 1223 if not found:
9b122384
PH
1224 # We look for Open Graph info:
1225 # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
b30b8698 1226 m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
9b122384
PH
1227 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1228 if m_video_type is not None:
ced659bb 1229 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
b30b8698 1230 if not found:
7fea7156 1231 # HTML5 video
9b32eca3 1232 found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
b30b8698 1233 if not found:
a5a45015 1234 found = re.search(
89ef304b 1235 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
a04aa7a9 1236 r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'?([^\'"]+)',
89ef304b 1237 webpage)
b30b8698
PH
1238 if found:
1239 new_url = found.group(1)
89ef304b
PH
1240 self.report_following_redirect(new_url)
1241 return {
1242 '_type': 'url',
1243 'url': new_url,
1244 }
b30b8698 1245 if not found:
416c7fcb 1246 raise UnsupportedError(url)
9b122384 1247
b30b8698
PH
1248 entries = []
1249 for video_url in found:
1250 video_url = compat_urlparse.urljoin(url, video_url)
1251 video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
9b122384 1252
b30b8698
PH
1253 # Sometimes, jwplayer extraction will result in a YouTube URL
1254 if YoutubeIE.suitable(video_url):
1255 entries.append(self.url_result(video_url, 'Youtube'))
1256 continue
9b122384 1257
b30b8698
PH
1258 # here's a fun little line of code for you:
1259 video_id = os.path.splitext(video_id)[0]
fc9713a1 1260
b30b8698
PH
1261 entries.append({
1262 'id': video_id,
1263 'url': video_url,
1264 'uploader': video_uploader,
1265 'title': video_title,
4d805e06 1266 'age_limit': age_limit,
b30b8698
PH
1267 })
1268
1269 if len(entries) == 1:
669f0e7c 1270 return entries[0]
b30b8698
PH
1271 else:
1272 for num, e in enumerate(entries, start=1):
13d8fbef
JMF
1273 # 'url' results don't have a title
1274 if e.get('title') is not None:
1275 e['title'] = '%s (%d)' % (e['title'], num)
b30b8698
PH
1276 return {
1277 '_type': 'playlist',
1278 'entries': entries,
1279 }