]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/generic.py
[generic] Add support for playwire embeds (Closes #5430)
[yt-dlp.git] / youtube_dl / extractor / generic.py
CommitLineData
cfe50f04
JMF
1# encoding: utf-8
2
79649588
PH
3from __future__ import unicode_literals
4
9b122384
PH
5import os
6import re
7
8from .common import InfoExtractor
fc9713a1 9from .youtube import YoutubeIE
8c25f81b 10from ..compat import (
9b122384 11 compat_urllib_parse,
a5caba1e 12 compat_urlparse,
f7300c5c 13 compat_xml_parse_error,
8c25f81b
PH
14)
15from ..utils import (
b759a0d4 16 determine_ext,
9b122384 17 ExtractorError,
c8e9a235 18 float_or_none,
aa94a6d3 19 HEADRequest,
61ca9a80 20 is_html,
ed2d6a19 21 orderedSet,
bcf89ce6 22 parse_xml,
9d4660ca
PH
23 smuggle_url,
24 unescapeHTML,
42393ce2 25 unified_strdate,
4d54ef20 26 unsmuggle_url,
416c7fcb 27 UnsupportedError,
42393ce2 28 url_basename,
76c73715 29 xpath_text,
9b122384 30)
cfe50f04 31from .brightcove import BrightcoveIE
a2edf2e7 32from .nbc import NBCSportsVPlayerIE
c0d0b01f 33from .ooyala import OoyalaIE
93d020dd 34from .rutv import RUTVIE
cb3ac1c6 35from .smotri import SmotriIE
1419fafd 36from .condenast import CondeNastIE
418c5cc3 37from .udn import UDNEmbedIE
9b122384 38
0838239e 39
9b122384 40class GenericIE(InfoExtractor):
79649588 41 IE_DESC = 'Generic downloader that works on some sites'
9b122384 42 _VALID_URL = r'.*'
79649588 43 IE_NAME = 'generic'
cfe50f04
JMF
44 _TESTS = [
45 {
79649588 46 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
d360a146 47 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
79649588 48 'info_dict': {
d360a146
S
49 'id': '13601338388002',
50 'ext': 'mp4',
79649588
PH
51 'uploader': 'www.hodiho.fr',
52 'title': 'R\u00e9gis plante sa Jeep',
cfe50f04
JMF
53 }
54 },
c19f7764
JMF
55 # bandcamp page with custom domain
56 {
79649588
PH
57 'add_ie': ['Bandcamp'],
58 'url': 'http://bronyrock.com/track/the-pony-mash',
79649588 59 'info_dict': {
fd50bf62
S
60 'id': '3235767654',
61 'ext': 'mp3',
79649588
PH
62 'title': 'The Pony Mash',
63 'uploader': 'M_Pallante',
c19f7764 64 },
79649588 65 'skip': 'There is a limit of 200 free downloads / month for the test song',
c19f7764 66 },
eeb165e6 67 # embedded brightcove video
dd5bcdc4
JMF
68 # it also tests brightcove videos that need to set the 'Referer' in the
69 # http requests
eeb165e6 70 {
79649588
PH
71 'add_ie': ['Brightcove'],
72 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
73 'info_dict': {
74 'id': '2765128793001',
75 'ext': 'mp4',
76 'title': 'Le cours de bourse : l’analyse technique',
77 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
78 'uploader': 'BFM BUSINESS',
eeb165e6 79 },
79649588
PH
80 'params': {
81 'skip_download': True,
eeb165e6
JMF
82 },
83 },
17ab4d3b
PH
84 {
85 # https://github.com/rg3/youtube-dl/issues/2253
86 'url': 'http://bcove.me/i6nfkrc3',
17ab4d3b
PH
87 'md5': '0ba9446db037002366bab3b3eb30c88c',
88 'info_dict': {
fd50bf62
S
89 'id': '3101154703001',
90 'ext': 'mp4',
17ab4d3b
PH
91 'title': 'Still no power',
92 'uploader': 'thestar.com',
93 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
94 },
95 'add_ie': ['Brightcove'],
96 },
0479c625
S
97 {
98 'url': 'http://www.championat.com/video/football/v/87/87499.html',
99 'md5': 'fb973ecf6e4a78a67453647444222983',
100 'info_dict': {
101 'id': '3414141473001',
102 'ext': 'mp4',
103 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
104 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
105 'uploader': 'Championat',
106 },
107 },
bdf97017 108 {
37aab278 109 # https://github.com/rg3/youtube-dl/issues/3541
bdf97017
NJ
110 'add_ie': ['Brightcove'],
111 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
112 'info_dict': {
113 'id': '3866516442001',
37aab278 114 'ext': 'mp4',
bdf97017
NJ
115 'title': 'Leer mij vrouwen kennen: Aflevering 1',
116 'description': 'Leer mij vrouwen kennen: Aflevering 1',
117 'uploader': 'SBS Broadcasting',
118 },
37aab278 119 'skip': 'Restricted to Netherlands',
bdf97017 120 'params': {
37aab278 121 'skip_download': True, # m3u8 download
bdf97017
NJ
122 },
123 },
42393ce2
PH
124 # Direct link to a video
125 {
79649588 126 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
79649588
PH
127 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
128 'info_dict': {
129 'id': 'trailer',
89ef304b 130 'ext': 'mp4',
79649588
PH
131 'title': 'trailer',
132 'upload_date': '20100513',
42393ce2 133 }
c0d0b01f
JMF
134 },
135 # ooyala video
136 {
79649588 137 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
87830900 138 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
79649588
PH
139 'info_dict': {
140 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
141 'ext': 'mp4',
3486df38 142 'title': '2cc213299525360.mov', # that's what we get
c0d0b01f 143 },
87830900 144 'add_ie': ['Ooyala'],
c0d0b01f 145 },
f076b638 146 # multiple ooyala embeds on SBN network websites
147 {
148 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
149 'info_dict': {
150 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
151 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
152 },
153 'playlist_mincount': 3,
154 'params': {
155 'skip_download': True,
156 },
157 'add_ie': ['Ooyala'],
158 },
89ef304b
PH
159 # google redirect
160 {
161 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
162 'info_dict': {
163 'id': 'cmQHVoWB5FY',
164 'ext': 'mp4',
165 'upload_date': '20130224',
166 'uploader_id': 'TheVerge',
87830900 167 'description': 're:^Chris Ziegler takes a look at the\.*',
89ef304b
PH
168 'uploader': 'The Verge',
169 'title': 'First Firefox OS phones side-by-side',
170 },
171 'params': {
172 'skip_download': False,
173 }
f55a1f0a 174 },
1b86cc41 175 # embed.ly video
176 {
177 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
178 'info_dict': {
179 'id': '9ODmcdjQcHQ',
180 'ext': 'mp4',
0a5bce56
PH
181 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
182 'upload_date': '20140225',
183 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
184 'uploader': 'Tested',
185 'uploader_id': 'testedcom',
1b86cc41 186 },
187 # No need to test YoutubeIE here
188 'params': {
189 'skip_download': True,
190 },
191 },
60cc4dc4
PH
192 # funnyordie embed
193 {
194 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
60cc4dc4
PH
195 'info_dict': {
196 'id': '18e820ec3f',
197 'ext': 'mp4',
198 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
199 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
93d020dd 200 },
60cc4dc4 201 },
faa4ea68
S
202 # BBC iPlayer embeds
203 {
204 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
205 'info_dict': {
206 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
207 },
208 'playlist_mincount': 18,
209 },
93d020dd
S
210 # RUTV embed
211 {
212 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
213 'info_dict': {
214 'id': '776940',
215 'ext': 'mp4',
216 'title': 'Охотское море стало целиком российским',
217 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
218 },
219 'params': {
220 # m3u8 download
221 'skip_download': True,
222 },
aab74fa1
PH
223 },
224 # Embedded TED video
225 {
226 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
a8eb5a8e 227 'md5': '65fdff94098e4a607385a60c5177c638',
aab74fa1 228 'info_dict': {
a8eb5a8e 229 'id': '1969',
aab74fa1 230 'ext': 'mp4',
a8eb5a8e
PH
231 'title': 'Hidden miracles of the natural world',
232 'uploader': 'Louie Schwartzberg',
233 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
aab74fa1 234 }
60cc4dc4 235 },
5c386252 236 # Embedded Ustream video
237 {
238 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
239 'md5': '27b99cdb639c9b12a79bca876a073417',
240 'info_dict': {
ca6aada4 241 'id': '45734260',
242 'ext': 'flv',
243 'uploader': 'AU SPA: The NSA and Privacy',
5c386252 244 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
245 }
246 },
d95e35d6
S
247 # nowvideo embed hidden behind percent encoding
248 {
249 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
250 'md5': '2baf4ddd70f697d94b1c18cf796d5107',
251 'info_dict': {
252 'id': '06e53103ca9aa',
253 'ext': 'flv',
254 'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
255 'description': 'No description',
256 },
0f2a2ba1 257 },
893f8832
PH
258 # arte embed
259 {
260 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
261 'md5': '7653032cbb25bf6c80d80f217055fa43',
262 'info_dict': {
263 'id': '048195-004_PLUS7-F',
264 'ext': 'flv',
265 'title': 'X:enius',
266 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
267 'upload_date': '20140320',
268 },
269 'params': {
270 'skip_download': 'Requires rtmpdump'
271 }
272 },
fa35cdad
PH
273 # Condé Nast embed
274 {
275 'url': 'http://www.wired.com/2014/04/honda-asimo/',
276 'md5': 'ba0dfe966fa007657bd1443ee672db0f',
277 'info_dict': {
278 'id': '53501be369702d3275860000',
279 'ext': 'mp4',
280 'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
281 }
ebd3c7b3
PH
282 },
283 # Dailymotion embed
284 {
285 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
286 'md5': '441aeeb82eb72c422c7f14ec533999cd',
287 'info_dict': {
288 'id': 'k2mm4bCdJ6CQ2i7c8o2',
289 'ext': 'mp4',
290 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
291 'uploader': 'Spi0n',
292 },
293 'add_ie': ['Dailymotion'],
2b88feed
PH
294 },
295 # YouTube embed
296 {
297 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
298 'info_dict': {
299 'id': 'FXRb4ykk4S0',
300 'ext': 'mp4',
301 'title': 'The NBL Auction 2014',
302 'uploader': 'BADMINTON England',
303 'uploader_id': 'BADMINTONEvents',
304 'upload_date': '20140603',
305 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
306 },
307 'add_ie': ['Youtube'],
308 'params': {
309 'skip_download': True,
310 }
311 },
c5cd249e
JMF
312 # MTVServices embed
313 {
314 'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
315 'md5': '35727f82f58c76d996fc188f9755b0d5',
316 'info_dict': {
317 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
318 'ext': 'mp4',
319 'title': 'Review',
320 'description': 'Mario\'s life in the fast lane has never looked so good.',
321 },
322 },
61013473 323 # YouTube embed via <data-embed-url="">
324 {
325 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
61013473 326 'info_dict': {
a8eb5a8e 327 'id': '4vAffPZIT44',
61013473 328 'ext': 'mp4',
a8eb5a8e 329 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
ed2d6a19
PH
330 'uploader': 'Gameloft',
331 'uploader_id': 'gameloft',
a8eb5a8e
PH
332 'upload_date': '20140828',
333 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
ed2d6a19
PH
334 },
335 'params': {
336 'skip_download': True,
61013473 337 }
c8e9a235
PH
338 },
339 # Camtasia studio
340 {
341 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
342 'playlist': [{
343 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
344 'info_dict': {
345 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
346 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
347 'ext': 'flv',
348 'duration': 2235.90,
349 }
350 }, {
351 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
352 'info_dict': {
353 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
354 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
355 'ext': 'flv',
356 'duration': 2235.93,
357 }
358 }],
359 'info_dict': {
360 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
361 }
4d805e06
PH
362 },
363 # Flowplayer
364 {
365 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
366 'md5': '9d65602bf31c6e20014319c7d07fba27',
367 'info_dict': {
368 'id': '5123ea6d5e5a7',
369 'ext': 'mp4',
370 'age_limit': 18,
371 'uploader': 'www.handjobhub.com',
d6d9186f 372 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
4d805e06 373 }
0990305d
PH
374 },
375 # RSS feed
376 {
377 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
378 'info_dict': {
379 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
380 'title': 'Zero Punctuation',
b1b0b1ca 381 'description': 're:.*groundbreaking video review series.*'
0990305d
PH
382 },
383 'playlist_mincount': 11,
22a6f150
PH
384 },
385 # Multiple brightcove videos
386 # https://github.com/rg3/youtube-dl/issues/2283
387 {
388 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
389 'info_dict': {
390 'id': 'always-never',
391 'title': 'Always / Never - The New Yorker',
392 },
393 'playlist_count': 3,
394 'params': {
395 'extract_flat': False,
396 'skip_download': True,
397 }
1a94ff68
S
398 },
399 # MLB embed
400 {
401 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
402 'md5': '96f09a37e44da40dd083e12d9a683327',
403 'info_dict': {
404 'id': '33322633',
405 'ext': 'mp4',
406 'title': 'Ump changes call to ball',
407 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
408 'duration': 48,
409 'timestamp': 1401537900,
410 'upload_date': '20140531',
411 'thumbnail': 're:^https?://.*\.jpg$',
412 },
413 },
746c67d7
NJ
414 # Wistia embed
415 {
416 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
417 'md5': '8788b683c777a5cf25621eaf286d0c23',
418 'info_dict': {
419 'id': '1cfaf6b7ea',
420 'ext': 'mov',
421 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
422 'duration': 643.0,
423 'filesize': 182808282,
424 'uploader': 'education-portal.com',
425 },
426 },
52cffcb1 427 {
428 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
429 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
430 'info_dict': {
431 'id': 'uxjb0lwrcz',
432 'ext': 'mp4',
85d7b765 433 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
52cffcb1 434 'duration': 1715.0,
85d7b765 435 'uploader': 'thoughtworks.wistia.com',
70b7e3fb 436 },
52cffcb1 437 },
70b7e3fb
PH
438 # Direct download with broken HEAD
439 {
440 'url': 'http://ai-radio.org:8000/radio.opus',
441 'info_dict': {
442 'id': 'radio',
443 'ext': 'opus',
444 'title': 'radio',
445 },
446 'params': {
447 'skip_download': True, # infinite live stream
448 },
449 'expected_warnings': [
450 r'501.*Not Implemented'
451 ],
ac645ac7
PH
452 },
453 # Soundcloud embed
454 {
455 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
456 'info_dict': {
457 'id': '174391317',
458 'ext': 'mp3',
459 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
460 'uploader': 'Sophos Security',
461 'title': 'Chet Chat 171 - Oct 29, 2014',
462 'upload_date': '20141029',
463 }
af63fed7
PH
464 },
465 # Livestream embed
466 {
467 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
468 'info_dict': {
469 'id': '67864563',
470 'ext': 'flv',
471 'upload_date': '20141112',
472 'title': 'Rosetta #CometLanding webcast HL 10',
473 }
474 },
65f3a228
PH
475 # LazyYT
476 {
477 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
478 'info_dict': {
11e611a7 479 'id': '1986',
65f3a228
PH
480 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
481 },
482 'playlist_mincount': 2,
4e262a88
PH
483 },
484 # Direct link with incorrect MIME type
485 {
486 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
487 'md5': '4ccbebe5f36706d85221f204d7eb5913',
488 'info_dict': {
489 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
490 'id': '5_Lennart_Poettering_-_Systemd',
491 'ext': 'webm',
492 'title': '5_Lennart_Poettering_-_Systemd',
493 'upload_date': '20141120',
494 },
495 'expected_warnings': [
496 'URL could be a direct video link, returning it as such.'
497 ]
42bdd9d0
PH
498 },
499 # Cinchcast embed
500 {
501 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
502 'info_dict': {
503 'id': '7141703',
504 'ext': 'mp3',
505 'upload_date': '20141126',
506 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
507 }
508 },
501f13fb
PH
509 # Cinerama player
510 {
511 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
512 'info_dict': {
513 'id': '730m_DandD_1901_512k',
514 'ext': 'mp4',
515 'uploader': 'www.abc.net.au',
516 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
517 }
796df3c6
S
518 },
519 # embedded viddler video
520 {
521 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
522 'info_dict': {
523 'id': '4d03aad9',
524 'ext': 'mp4',
525 'uploader': 'deadspin',
526 'title': 'WALL-TO-GORTAT',
527 'timestamp': 1422285291,
528 'upload_date': '20150126',
529 },
530 'add_ie': ['Viddler'],
a0f71985 531 },
2051acde
S
532 # Libsyn embed
533 {
534 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
535 'info_dict': {
536 'id': '3377616',
537 'ext': 'mp3',
538 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
539 'description': 'md5:601cb790edd05908957dae8aaa866465',
540 'upload_date': '20150220',
541 },
542 },
a0f71985
PH
543 # jwplayer YouTube
544 {
545 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
546 'info_dict': {
547 'id': 'Mrj4DVp2zeA',
548 'ext': 'mp4',
f37e3f99 549 'upload_date': '20150212',
a0f71985
PH
550 'uploader': 'The National Archives UK',
551 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
552 'uploader_id': 'NationalArchives08',
553 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
554 },
59b8ab58
PH
555 },
556 # rtl.nl embed
557 {
558 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
559 'playlist_mincount': 5,
560 'info_dict': {
561 'id': 'aanslagen-kopenhagen',
562 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
563 }
255fca5e
S
564 },
565 # Zapiks embed
566 {
567 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
568 'info_dict': {
569 'id': '118046',
570 'ext': 'mp4',
571 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
572 }
573 },
e3216b82
NJ
574 # Kaltura embed
575 {
576 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
577 'info_dict': {
578 'id': '1_eergr3h1',
579 'ext': 'mp4',
580 'upload_date': '20150226',
581 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
582 'timestamp': int,
583 'title': 'John Carlson Postgame 2/25/15',
584 },
585 },
135c9c42
S
586 # Eagle.Platform embed (generic URL)
587 {
588 'url': 'http://lenta.ru/news/2015/03/06/navalny/',
589 'info_dict': {
590 'id': '227304',
591 'ext': 'mp4',
592 'title': 'Навальный вышел на свободу',
593 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
594 'thumbnail': 're:^https?://.*\.jpg$',
595 'duration': 87,
596 'view_count': int,
597 'age_limit': 0,
598 },
599 },
d47ae7f6
S
600 # ClipYou (Eagle.Platform) embed (custom URL)
601 {
602 'url': 'http://muz-tv.ru/play/7129/',
603 'info_dict': {
604 'id': '12820',
605 'ext': 'mp4',
606 'title': "'O Sole Mio",
607 'thumbnail': 're:^https?://.*\.jpg$',
608 'duration': 216,
609 'view_count': int,
610 },
611 },
f8388757
S
612 # Pladform embed
613 {
614 'url': 'http://muz-tv.ru/kinozal/view/7400/',
615 'info_dict': {
616 'id': '100183293',
617 'ext': 'mp4',
618 'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
619 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
620 'thumbnail': 're:^https?://.*\.jpg$',
621 'duration': 694,
622 'age_limit': 0,
623 },
624 },
ad320e9b
NJ
625 # 5min embed
626 {
627 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
628 'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
629 'info_dict': {
630 'id': '518726732',
631 'ext': 'mp4',
632 'title': 'Facebook Creates "On This Day" | Crunch Report',
633 },
634 },
76c73715
PH
635 # RSS feed with enclosure
636 {
637 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
638 'info_dict': {
639 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
640 'ext': 'm4v',
641 'upload_date': '20150228',
642 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
643 }
a2edf2e7 644 },
a4257017
S
645 # Crooks and Liars embed
646 {
647 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
648 'info_dict': {
649 'id': '8RUoRhRi',
650 'ext': 'mp4',
651 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
652 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
653 'timestamp': 1428207000,
654 'upload_date': '20150405',
655 'uploader': 'Heather',
656 },
657 },
658 # Crooks and Liars external embed
659 {
660 'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
661 'info_dict': {
662 'id': 'MTE3MjUtMzQ2MzA',
663 'ext': 'mp4',
664 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
665 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
666 'timestamp': 1265032391,
667 'upload_date': '20100201',
668 'uploader': 'Heather',
669 },
670 },
facecb84 671 # NBC Sports vplayer embed
a2edf2e7 672 {
facecb84 673 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
a2edf2e7 674 'info_dict': {
facecb84
S
675 'id': 'ln7x1qSThw4k',
676 'ext': 'flv',
677 'title': "PFT Live: New leader in the 'new-look' defense",
678 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
a2edf2e7 679 },
418c5cc3
YCH
680 },
681 # UDN embed
682 {
683 'url': 'http://www.udn.com/news/story/7314/822787',
01c58f84 684 'md5': 'fd2060e988c326991037b9aff9df21a6',
418c5cc3 685 'info_dict': {
01c58f84 686 'id': '300346',
418c5cc3 687 'ext': 'mp4',
01c58f84 688 'title': '中一中男師變性 全校師生力挺',
418c5cc3
YCH
689 'thumbnail': 're:^https?://.*\.jpg$',
690 }
edfcf7ab
YCH
691 },
692 # Ooyala embed
693 {
694 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
695 'info_dict': {
696 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
697 'ext': 'mp4',
698 'description': 'VIDEO: Index/Match versus VLOOKUP.',
699 'title': 'This is what separates the Excel masters from the wannabes',
700 },
701 'params': {
702 # m3u8 downloads
703 'skip_download': True,
704 }
76c73715 705 }
cfe50f04 706 ]
9b122384 707
9b122384
PH
def report_following_redirect(self, new_url):
    """Tell the user that a redirect to new_url is being followed."""
    notice = '[redirect] Following redirect to %s' % new_url
    self._downloader.to_screen(notice)
9b122384 711
4fc946b5
PH
712 def _extract_rss(self, url, video_id, doc):
713 playlist_title = doc.find('./channel/title').text
714 playlist_desc_el = doc.find('./channel/description')
715 playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
716
76c73715
PH
717 entries = []
718 for it in doc.findall('./channel/item'):
719 next_url = xpath_text(it, 'link', fatal=False)
720 if not next_url:
721 enclosure_nodes = it.findall('./enclosure')
722 for e in enclosure_nodes:
723 next_url = e.attrib.get('url')
724 if next_url:
725 break
726
727 if not next_url:
728 continue
729
730 entries.append({
731 '_type': 'url',
732 'url': next_url,
733 'title': it.find('title').text,
734 })
4fc946b5
PH
735
736 return {
737 '_type': 'playlist',
738 'id': url,
739 'title': playlist_title,
740 'description': playlist_desc,
741 'entries': entries,
742 }
743
c8e9a235
PH
def _extract_camtasia(self, url, video_id, webpage):
    """Build a playlist from a Camtasia Studio page.

    The page is searched for a fo.addVariable("csConfigFile", ...) call;
    the referenced XML configuration is downloaded (resolved relative to
    url) and every child of its ./playlist/array/fileset node that has a
    <uri> becomes one playlist entry.

    Returns None if no camtasia video can be found, i.e. when the page
    does not reference a configuration file or the configuration has no
    fileset node.  The DC.title meta tag is looked up with fatal=True.
    """
    camtasia_cfg = self._search_regex(
        r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
        webpage, 'camtasia configuration file', default=None)
    if camtasia_cfg is None:
        return None

    title = self._html_search_meta('DC.title', webpage, fatal=True)

    camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
    camtasia_cfg = self._download_xml(
        camtasia_url, video_id,
        note='Downloading camtasia configuration',
        errnote='Failed to download camtasia configuration')
    fileset_node = camtasia_cfg.find('./playlist/array/fileset')
    if fileset_node is None:
        # Configuration without a fileset: nothing to extract here
        return None

    entries = []
    # Iterate the element directly: Element.getchildren() was deprecated
    # and no longer exists in Python 3.9+.
    for n in fileset_node:
        url_n = n.find('./uri')
        if url_n is None or not url_n.text:
            continue

        # <duration> is treated as optional; a missing node gives None
        duration_n = n.find('./duration')
        duration = None if duration_n is None else float_or_none(duration_n.text)

        entries.append({
            'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
            'title': '%s - %s' % (title, n.tag),
            'url': compat_urlparse.urljoin(url, url_n.text),
            'duration': duration,
        })

    return {
        '_type': 'playlist',
        'entries': entries,
        'title': title,
    }
780
9b122384 781 def _real_extract(self, url):
ebd3c7b3
PH
782 if url.startswith('//'):
783 return {
784 '_type': 'url',
20991253 785 'url': self.http_scheme() + url,
ebd3c7b3
PH
786 }
787
a7130543
JMF
788 parsed_url = compat_urlparse.urlparse(url)
789 if not parsed_url.scheme:
04b4d394
PH
790 default_search = self._downloader.params.get('default_search')
791 if default_search is None:
1f7ccb90 792 default_search = 'fixup_error'
04b4d394 793
1f7ccb90 794 if default_search in ('auto', 'auto_warning', 'fixup_error'):
04b4d394
PH
795 if '/' in url:
796 self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
797 return self.url_result('http://' + url)
1f7ccb90 798 elif default_search != 'fixup_error':
9c1fc022 799 if default_search == 'auto_warning':
0e67ab0d
PH
800 if re.match(r'^(?:url|URL)$', url):
801 raise ExtractorError(
802 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
803 expected=True)
804 else:
805 self._downloader.report_warning(
7571c02c 806 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
04b4d394 807 return self.url_result('ytsearch:' + url)
1f7ccb90
PH
808
809 if default_search in ('error', 'fixup_error'):
7571c02c 810 raise ExtractorError(
b74e86f4
PH
811 '%r is not a valid URL. '
812 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
813 % (url, url), expected=True)
04b4d394 814 else:
f2f2c0c2
PH
815 if ':' not in default_search:
816 default_search += ':'
04b4d394 817 return self.url_result(default_search + url)
4d54ef20
PH
818
819 url, smuggled_data = unsmuggle_url(url)
820 force_videoid = None
d6e6a422 821 is_intentional = smuggled_data and smuggled_data.get('to_generic')
4d54ef20
PH
822 if smuggled_data and 'force_videoid' in smuggled_data:
823 force_videoid = smuggled_data['force_videoid']
824 video_id = force_videoid
825 else:
826 video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
a7130543 827
79649588 828 self.to_screen('%s: Requesting header' % video_id)
c1d1facd 829
ebab4520 830 head_req = HEADRequest(url)
23be51d8 831 head_response = self._request_webpage(
ebab4520
PH
832 head_req, video_id,
833 note=False, errnote='Could not send HEAD request to %s' % url,
834 fatal=False)
42393ce2 835
23be51d8 836 if head_response is not False:
42393ce2 837 # Check for redirect
23be51d8 838 new_url = head_response.geturl()
42393ce2
PH
839 if url != new_url:
840 self.report_following_redirect(new_url)
4d54ef20
PH
841 if force_videoid:
842 new_url = smuggle_url(
843 new_url, {'force_videoid': force_videoid})
cecaaf3f 844 return self.url_result(new_url)
42393ce2 845
23be51d8
PH
846 full_response = None
847 if head_response is False:
848 full_response = self._request_webpage(url, video_id)
849 head_response = full_response
850
851 # Check for direct link to a video
852 content_type = head_response.headers.get('Content-Type', '')
853 m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
854 if m:
855 upload_date = unified_strdate(
856 head_response.headers.get('Last-Modified'))
857 return {
858 'id': video_id,
859 'title': os.path.splitext(url_basename(url))[0],
ccdd0ffb 860 'direct': True,
23be51d8
PH
861 'formats': [{
862 'format_id': m.group('format_id'),
863 'url': url,
864 'vcodec': 'none' if m.group('type') == 'audio' else None
865 }],
866 'upload_date': upload_date,
867 }
42393ce2 868
d6e6a422
PH
869 if not self._downloader.params.get('test', False) and not is_intentional:
870 self._downloader.report_warning('Falling back on generic information extractor.')
871
4e262a88
PH
872 if not full_response:
873 full_response = self._request_webpage(url, video_id)
874
875 # Maybe it's a direct link to a video?
876 # Be careful not to download the whole thing!
877 first_bytes = full_response.read(512)
61ca9a80 878 if not is_html(first_bytes):
4e262a88
PH
879 self._downloader.report_warning(
880 'URL could be a direct video link, returning it as such.')
881 upload_date = unified_strdate(
882 head_response.headers.get('Last-Modified'))
883 return {
884 'id': video_id,
885 'title': os.path.splitext(url_basename(url))[0],
886 'direct': True,
887 'url': url,
888 'upload_date': upload_date,
889 }
890
891 webpage = self._webpage_read_content(
892 full_response, url, video_id, prefix=first_bytes)
893
9b122384 894 self.report_extraction(video_id)
887c6acd 895
4fc946b5
PH
896 # Is it an RSS feed?
897 try:
bcf89ce6 898 doc = parse_xml(webpage)
4fc946b5
PH
899 if doc.tag == 'rss':
900 return self._extract_rss(url, video_id, doc)
f7300c5c 901 except compat_xml_parse_error:
4fc946b5
PH
902 pass
903
c8e9a235
PH
904 # Is it a Camtasia project?
905 camtasia_res = self._extract_camtasia(url, video_id, webpage)
906 if camtasia_res is not None:
907 return camtasia_res
908
14390730
S
909 # Sometimes embedded video player is hidden behind percent encoding
910 # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
911 # Unescaping the whole page allows to handle those cases in a generic way
1f7659db
S
912 webpage = compat_urllib_parse.unquote(webpage)
913
887c6acd
PH
914 # it's tempting to parse this further, but you would
915 # have to take into account all the variations like
916 # Video Title - Site Name
917 # Site Name | Video Title
918 # Video Title - Tagline | Site Name
919 # and so on and so forth; it's just not practical
ef4fd848 920 video_title = self._html_search_regex(
79649588
PH
921 r'(?s)<title>(.*?)</title>', webpage, 'video title',
922 default='video')
ef4fd848 923
4d805e06
PH
924 # Try to detect age limit automatically
925 age_limit = self._rta_search(webpage)
926 # And then there are the jokers who advertise that they use RTA,
927 # but actually don't.
928 AGE_LIMIT_MARKERS = [
929 r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
930 ]
931 if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
932 age_limit = 18
933
ef4fd848
PH
934 # video uploader is domain name
935 video_uploader = self._search_regex(
79649588 936 r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
887c6acd 937
ed2d6a19 938 # Helper method
def _playlist_from_matches(matches, getter=None, ie=None):
    """Turn a sequence of regex matches into a playlist result.

    matches -- iterable of matches (plain strings or findall tuples)
    getter  -- optional callable mapping one match to its URL; when
               omitted (or falsy) the match itself is used as the URL
    ie      -- optional extractor key forwarded to self.url_result

    The playlist id and title are taken from the enclosing scope
    (video_id / video_title).
    """
    def _entry_for(match):
        # Resolve the URL for this match, then let the extractor helpers
        # normalise it and wrap it into a url result.
        embed_url = getter(match) if getter else match
        return self.url_result(self._proto_relative_url(embed_url), ie)

    # orderedSet is fed lazily; presumably it drops duplicate entries
    # while keeping first-seen order -- confirm against ..utils.
    unique_entries = orderedSet(_entry_for(match) for match in matches)
    return self.playlist_result(
        unique_entries, playlist_id=video_id, playlist_title=video_title)
945
627a91a9 946 # Look for BrightCove:
99877772
PH
947 bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
948 if bc_urls:
79649588 949 self.to_screen('Brightcove video detected.')
99877772
PH
950 entries = [{
951 '_type': 'url',
952 'url': smuggle_url(bc_url, {'Referer': url}),
953 'ie_key': 'Brightcove'
954 } for bc_url in bc_urls]
955
956 return {
957 '_type': 'playlist',
958 'title': video_title,
959 'id': video_id,
960 'entries': entries,
961 }
cfe50f04 962
59b8ab58
PH
963 # Look for embedded rtl.nl player
964 matches = re.findall(
965 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
966 webpage)
967 if matches:
968 return _playlist_from_matches(matches, ie='RtlNl')
969
7115ca84 970 # Look for embedded (iframe) Vimeo player
9d4660ca 971 mobj = re.search(
15fd51b3 972 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
9d4660ca 973 if mobj:
15fd51b3 974 player_url = unescapeHTML(mobj.group('url'))
9d4660ca 975 surl = smuggle_url(player_url, {'Referer': url})
09a42738 976 return self.url_result(surl)
7115ca84
PH
977 # Look for embedded (swf embed) Vimeo player
978 mobj = re.search(
09a42738 979 r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
7115ca84 980 if mobj:
09a42738 981 return self.url_result(mobj.group(1))
7115ca84 982
53c1d3ef 983 # Look for embedded YouTube player
1f9da904 984 matches = re.findall(r'''(?x)
2b88feed
PH
985 (?:
986 <iframe[^>]+?src=|
c71dfccc 987 data-video-url=|
2b88feed 988 <embed[^>]+?src=|
a7e97f6d
PH
989 embedSWF\(?:\s*|
990 new\s+SWFObject\(
2b88feed
PH
991 )
992 (["\'])
1bf5423e 993 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
6b08cdf6 994 (?:embed|v|p)/.+?)
1f9da904 995 \1''', webpage)
887c6acd 996 if matches:
ed2d6a19 997 return _playlist_from_matches(
3b2f933b 998 matches, lambda m: unescapeHTML(m[1]))
53c1d3ef 999
65f3a228
PH
1000 # Look for lazyYT YouTube embed
1001 matches = re.findall(
1002 r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
1003 if matches:
1004 return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
1005
355e4fd0
PH
1006 # Look for embedded Dailymotion player
1007 matches = re.findall(
ef4fd848 1008 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
355e4fd0 1009 if matches:
ed2d6a19
PH
1010 return _playlist_from_matches(
1011 matches, lambda m: unescapeHTML(m[1]))
355e4fd0 1012
8489578d
NJ
1013 # Look for embedded Dailymotion playlist player (#3822)
1014 m = re.search(
1015 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
1016 if m:
1017 playlists = re.findall(
1018 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
1019 if playlists:
1020 return _playlist_from_matches(
1021 playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
1022
ef4fd848
PH
1023 # Look for embedded Wistia player
1024 match = re.search(
281d3f1d 1025 r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
ef4fd848 1026 if match:
9471c444
NJ
1027 embed_url = self._proto_relative_url(
1028 unescapeHTML(match.group('url')))
ef4fd848
PH
1029 return {
1030 '_type': 'url_transparent',
9471c444 1031 'url': embed_url,
ef4fd848
PH
1032 'ie_key': 'Wistia',
1033 'uploader': video_uploader,
1034 'title': video_title,
1035 'id': video_id,
1036 }
5f6a1245 1037
9471c444 1038 match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
746c67d7
NJ
1039 if match:
1040 return {
1041 '_type': 'url_transparent',
1042 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
1043 'ie_key': 'Wistia',
1044 'uploader': video_uploader,
1045 'title': video_title,
1046 'id': match.group('id')
1047 }
ef4fd848 1048
ee3e63e4 1049 # Look for embedded blip.tv player
19dab5e6 1050 mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
ee3e63e4 1051 if mobj:
2514d263 1052 return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
1f8b6af7 1053 mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
ee3e63e4 1054 if mobj:
19dab5e6 1055 return self.url_result(mobj.group(1), 'BlipTV')
ee3e63e4 1056
fa35cdad
PH
1057 # Look for embedded condenast player
1058 matches = re.findall(
1059 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1060 webpage)
1061 if matches:
1062 return {
1063 '_type': 'playlist',
1064 'entries': [{
1065 '_type': 'url',
1066 'ie_key': 'CondeNast',
1067 'url': ma,
1068 } for ma in matches],
1069 'title': video_title,
1070 'id': video_id,
1071 }
1072
c19f7764
JMF
1073 # Look for Bandcamp pages with custom domain
1074 mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1075 if mobj is not None:
1076 burl = unescapeHTML(mobj.group(1))
09804265
JMF
1077 # Don't set the extractor because it can be a track url or an album
1078 return self.url_result(burl)
c19f7764 1079
f25571ff
PH
1080 # Look for embedded Vevo player
1081 mobj = re.search(
1082 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1083 if mobj is not None:
1084 return self.url_result(mobj.group('url'))
796df3c6
S
1085
1086 # Look for embedded Viddler player
cb454b33
S
1087 mobj = re.search(
1088 r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1089 webpage)
796df3c6
S
1090 if mobj is not None:
1091 return self.url_result(mobj.group('url'))
f25571ff 1092
3378d67a
S
1093 # Look for NYTimes player
1094 mobj = re.search(
1095 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1096 webpage)
1097 if mobj is not None:
1098 return self.url_result(mobj.group('url'))
1099
cefdf970
S
1100 # Look for Libsyn player
1101 mobj = re.search(
1102 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1103 if mobj is not None:
1104 return self.url_result(mobj.group('url'))
1105
c0d0b01f 1106 # Look for Ooyala videos
cb454b33 1107 mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
f076b638 1108 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
edfcf7ab
YCH
1109 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
1110 re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
c0d0b01f 1111 if mobj is not None:
750f9020 1112 return OoyalaIE._build_url_result(mobj.group('ec'))
c0d0b01f 1113
f076b638 1114 # Look for multiple Ooyala embeds on SBN network websites
1115 mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1116 if mobj is not None:
1117 embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1118 if embeds:
1119 return _playlist_from_matches(
1120 embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1121
aa94a6d3 1122 # Look for Aparat videos
48099643 1123 mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
aa94a6d3
PH
1124 if mobj is not None:
1125 return self.url_result(mobj.group(1), 'Aparat')
1126
c93c2ab1 1127 # Look for MPORA videos
c3f51436 1128 mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
c93c2ab1
PH
1129 if mobj is not None:
1130 return self.url_result(mobj.group(1), 'Mpora')
5f59ee79 1131
15c0e8e7 1132 # Look for embedded NovaMov-based player
8f89e687 1133 mobj = re.search(
8dfa187b 1134 r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
15c0e8e7
S
1135 (?P<url>http://(?:(?:embed|www)\.)?
1136 (?:novamov\.com|
1137 nowvideo\.(?:ch|sx|eu|at|ag|co)|
1138 videoweed\.(?:es|com)|
1139 movshare\.(?:net|sx|ag)|
1140 divxstage\.(?:eu|net|ch|co|at|ag))
1141 /embed\.php.+?)\1''', webpage)
8f89e687 1142 if mobj is not None:
15c0e8e7 1143 return self.url_result(mobj.group('url'))
50f56607 1144
9834872b
PH
1145 # Look for embedded Facebook player
1146 mobj = re.search(
db1f3888 1147 r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
9834872b
PH
1148 if mobj is not None:
1149 return self.url_result(mobj.group('url'), 'Facebook')
1150
ca97a56e
S
1151 # Look for embedded VK player
1152 mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1153 if mobj is not None:
1154 return self.url_result(mobj.group('url'), 'VK')
1155
0364fa8b
S
1156 # Look for embedded ivi player
1157 mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1158 if mobj is not None:
1159 return self.url_result(mobj.group('url'), 'Ivi')
1160
db1f3888
PH
1161 # Look for embedded Huffington Post player
1162 mobj = re.search(
c3f51436 1163 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
db1f3888
PH
1164 if mobj is not None:
1165 return self.url_result(mobj.group('url'), 'HuffPost')
1166
1b86cc41 1167 # Look for embed.ly
1168 mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1169 if mobj is not None:
1170 return self.url_result(mobj.group('url'))
1171 mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1172 if mobj is not None:
1173 return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1174
60cc4dc4
PH
1175 # Look for funnyordie embed
1176 matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1177 if matches:
ed2d6a19
PH
1178 return _playlist_from_matches(
1179 matches, getter=unescapeHTML, ie='FunnyOrDie')
60cc4dc4 1180
db546cf8
S
1181 # Look for BBC iPlayer embed
1182 matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1183 if matches:
476eae0c 1184 return _playlist_from_matches(matches, ie='BBCCoUk')
db546cf8 1185
93d020dd
S
1186 # Look for embedded RUTV player
1187 rutv_url = RUTVIE._extract_url(webpage)
1188 if rutv_url:
1189 return self.url_result(rutv_url, 'RUTV')
1190
7e2ede98
JMF
1191 # Look for embedded TED player
1192 mobj = re.search(
d7cc31b6 1193 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
7e2ede98
JMF
1194 if mobj is not None:
1195 return self.url_result(mobj.group('url'), 'TED')
1196
5c386252 1197 # Look for embedded Ustream videos
1198 mobj = re.search(
1199 r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1200 if mobj is not None:
1201 return self.url_result(mobj.group('url'), 'Ustream')
1202
893f8832
PH
1203 # Look for embedded arte.tv player
1204 mobj = re.search(
1205 r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1206 webpage)
1207 if mobj is not None:
1208 return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1209
cb3ac1c6
S
1210 # Look for embedded smotri.com player
1211 smotri_url = SmotriIE._extract_url(webpage)
1212 if smotri_url:
1213 return self.url_result(smotri_url, 'Smotri')
1214
20991253
PH
1215 # Look for embedded soundcloud player
1216 mobj = re.search(
ac645ac7 1217 r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
20991253
PH
1218 webpage)
1219 if mobj is not None:
1220 url = unescapeHTML(mobj.group('url'))
1221 return self.url_result(url)
1222
826ec77f
PH
1223 # Look for embedded vulture.com player
1224 mobj = re.search(
1225 r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1226 webpage)
1227 if mobj is not None:
1228 url = unescapeHTML(mobj.group('url'))
1229 return self.url_result(url, ie='Vulture')
1230
c5cd249e
JMF
1231 # Look for embedded mtvservices player
1232 mobj = re.search(
1233 r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1234 webpage)
1235 if mobj is not None:
1236 url = unescapeHTML(mobj.group('url'))
1237 return self.url_result(url, ie='MTVServicesEmbedded')
1238
49807b4a
S
1239 # Look for embedded yahoo player
1240 mobj = re.search(
1241 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1242 webpage)
1243 if mobj is not None:
1244 return self.url_result(mobj.group('url'), 'Yahoo')
1245
2ef6fcb5
PH
1246 # Look for embedded sbs.com.au player
1247 mobj = re.search(
e98b8e79
PH
1248 r'''(?x)
1249 (?:
1250 <meta\s+property="og:video"\s+content=|
1251 <iframe[^>]+?src=
1252 )
1253 (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
2ef6fcb5
PH
1254 webpage)
1255 if mobj is not None:
1256 return self.url_result(mobj.group('url'), 'SBS')
1257
42bdd9d0
PH
1258 # Look for embedded Cinchcast player
1259 mobj = re.search(
1260 r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1261 webpage)
1262 if mobj is not None:
1263 return self.url_result(mobj.group('url'), 'Cinchcast')
1264
1a94ff68 1265 mobj = re.search(
5263cdfc 1266 r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1a94ff68
S
1267 webpage)
1268 if mobj is not None:
1269 return self.url_result(mobj.group('url'), 'MLB')
1270
1419fafd
S
1271 mobj = re.search(
1272 r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1273 webpage)
1274 if mobj is not None:
1275 return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1276
af63fed7
PH
1277 mobj = re.search(
1278 r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1279 webpage)
1280 if mobj is not None:
1281 return self.url_result(mobj.group('url'), 'Livestream')
1282
255fca5e
S
1283 # Look for Zapiks embed
1284 mobj = re.search(
1285 r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1286 if mobj is not None:
1287 return self.url_result(mobj.group('url'), 'Zapiks')
1288
e3216b82
NJ
1289 # Look for Kaltura embeds
1290 mobj = re.search(
1291 r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1292 if mobj is not None:
1293 return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1294
135c9c42
S
1295 # Look for Eagle.Platform embeds
1296 mobj = re.search(
1297 r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1298 if mobj is not None:
1299 return self.url_result(mobj.group('url'), 'EaglePlatform')
1300
d47ae7f6
S
1301 # Look for ClipYou (uses Eagle.Platform) embeds
1302 mobj = re.search(
1303 r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1304 if mobj is not None:
1305 return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1306
f8388757
S
1307 # Look for Pladform embeds
1308 mobj = re.search(
1309 r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1310 if mobj is not None:
1311 return self.url_result(mobj.group('url'), 'Pladform')
1312
2dcc114f
S
1313 # Look for Playwire embeds
1314 mobj = re.search(
1315 r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
1316 if mobj is not None:
1317 return self.url_result(mobj.group('url'))
1318
ad320e9b
NJ
1319 # Look for 5min embeds
1320 mobj = re.search(
1321 r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1322 if mobj is not None:
1323 return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1324
18153f1b
S
1325 # Look for Crooks and Liars embeds
1326 mobj = re.search(
1327 r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
1328 if mobj is not None:
1329 return self.url_result(mobj.group('url'))
1330
a2edf2e7
YCH
1331 # Look for NBC Sports VPlayer embeds
1332 nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1333 if nbc_sports_url:
1334 return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1335
418c5cc3
YCH
1336 # Look for UDN embeds
1337 mobj = re.search(
1338 r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
1339 if mobj is not None:
1340 return self.url_result(
0a160363 1341 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
418c5cc3 1342
def check_video(vurl):
    """Tell whether vurl is worth keeping as a video candidate.

    URLs accepted by YoutubeIE are always kept. Any other URL is kept
    only if its path contains a dot and its extension is not one of the
    known non-video ones (flash, images, subtitles).
    """
    if not YoutubeIE.suitable(vurl):
        url_path = compat_urlparse.urlparse(vurl).path
        extension = determine_ext(url_path)
        if '.' not in url_path:
            # No dot in the path at all: reject
            return False
        non_video_exts = ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
        return extension not in non_video_exts
    return True
1349
def filter_video(urls):
    """Return, as a list and in order, the URLs accepted by check_video."""
    return [candidate for candidate in urls if check_video(candidate)]
1352
9b122384 1353 # Start with something easy: JW Player in SWFObject
ced659bb 1354 found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
b30b8698 1355 if not found:
d981cef6 1356 # Look for gorilla-vid style embedding
ced659bb 1357 found = filter_video(re.findall(r'''(?sx)
c0292e8a
PH
1358 (?:
1359 jw_plugins|
1360 JWPlayerOptions|
1361 jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1362 )
a0f71985
PH
1363 .*?
1364 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
b30b8698 1365 if not found:
9b122384 1366 # Broaden the search a little bit
ced659bb 1367 found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
b30b8698
PH
1368 if not found:
1369 # Broaden the findall a little bit: JWPlayer JS loader
ced659bb
S
1370 found = filter_video(re.findall(
1371 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
4d805e06
PH
1372 if not found:
1373 # Flow player
ced659bb 1374 found = filter_video(re.findall(r'''(?xs)
4d805e06
PH
1375 flowplayer\("[^"]+",\s*
1376 \{[^}]+?\}\s*,
52585fd6 1377 \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
4d805e06 1378 ["']?url["']?\s*:\s*["']([^"']+)["']
ced659bb 1379 ''', webpage))
501f13fb
PH
1380 if not found:
1381 # Cinerama player
1382 found = re.findall(
1383 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
b30b8698 1384 if not found:
9b122384 1385 # Try to find twitter cards info
ced659bb
S
1386 found = filter_video(re.findall(
1387 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
b30b8698 1388 if not found:
9b122384
PH
1389 # We look for Open Graph info:
1390 # We have to match any number of spaces between elements, some sites try to align them (eg.: statigr.am)
b30b8698 1391 m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
9b122384
PH
1392 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1393 if m_video_type is not None:
ced659bb 1394 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
b30b8698 1395 if not found:
7fea7156 1396 # HTML5 video
9b32eca3 1397 found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
b30b8698 1398 if not found:
ed9a25dd 1399 REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
a5a45015 1400 found = re.search(
89ef304b 1401 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
ed9a25dd 1402 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
89ef304b 1403 webpage)
84f81016
S
1404 if not found:
1405 # Look also in Refresh HTTP header
1406 refresh_header = head_response.headers.get('Refresh')
1407 if refresh_header:
ed9a25dd 1408 found = re.search(REDIRECT_REGEX, refresh_header)
b30b8698
PH
1409 if found:
1410 new_url = found.group(1)
89ef304b
PH
1411 self.report_following_redirect(new_url)
1412 return {
1413 '_type': 'url',
1414 'url': new_url,
1415 }
b30b8698 1416 if not found:
416c7fcb 1417 raise UnsupportedError(url)
9b122384 1418
b30b8698
PH
1419 entries = []
1420 for video_url in found:
1421 video_url = compat_urlparse.urljoin(url, video_url)
1422 video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
9b122384 1423
b30b8698
PH
1424 # Sometimes, jwplayer extraction will result in a YouTube URL
1425 if YoutubeIE.suitable(video_url):
1426 entries.append(self.url_result(video_url, 'Youtube'))
1427 continue
9b122384 1428
b30b8698
PH
1429 # here's a fun little line of code for you:
1430 video_id = os.path.splitext(video_id)[0]
fc9713a1 1431
b30b8698
PH
1432 entries.append({
1433 'id': video_id,
1434 'url': video_url,
1435 'uploader': video_uploader,
1436 'title': video_title,
4d805e06 1437 'age_limit': age_limit,
b30b8698
PH
1438 })
1439
1440 if len(entries) == 1:
669f0e7c 1441 return entries[0]
b30b8698
PH
1442 else:
1443 for num, e in enumerate(entries, start=1):
13d8fbef
JMF
1444 # 'url' results don't have a title
1445 if e.get('title') is not None:
1446 e['title'] = '%s (%d)' % (e['title'], num)
b30b8698
PH
1447 return {
1448 '_type': 'playlist',
1449 'entries': entries,
1450 }