]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/generic.py
[generic] Add support for Crooks and Liars embeds
[yt-dlp.git] / youtube_dl / extractor / generic.py
CommitLineData
cfe50f04
JMF
1# encoding: utf-8
2
79649588
PH
3from __future__ import unicode_literals
4
9b122384
PH
5import os
6import re
7
8from .common import InfoExtractor
fc9713a1 9from .youtube import YoutubeIE
8c25f81b 10from ..compat import (
9b122384 11 compat_urllib_parse,
a5caba1e 12 compat_urlparse,
f7300c5c 13 compat_xml_parse_error,
8c25f81b
PH
14)
15from ..utils import (
b759a0d4 16 determine_ext,
9b122384 17 ExtractorError,
c8e9a235 18 float_or_none,
aa94a6d3 19 HEADRequest,
61ca9a80 20 is_html,
ed2d6a19 21 orderedSet,
bcf89ce6 22 parse_xml,
9d4660ca
PH
23 smuggle_url,
24 unescapeHTML,
42393ce2 25 unified_strdate,
4d54ef20 26 unsmuggle_url,
416c7fcb 27 UnsupportedError,
42393ce2 28 url_basename,
76c73715 29 xpath_text,
9b122384 30)
cfe50f04 31from .brightcove import BrightcoveIE
a2edf2e7 32from .nbc import NBCSportsVPlayerIE
c0d0b01f 33from .ooyala import OoyalaIE
93d020dd 34from .rutv import RUTVIE
cb3ac1c6 35from .smotri import SmotriIE
1419fafd 36from .condenast import CondeNastIE
418c5cc3 37from .udn import UDNEmbedIE
9b122384 38
0838239e 39
9b122384 40class GenericIE(InfoExtractor):
79649588 41 IE_DESC = 'Generic downloader that works on some sites'
9b122384 42 _VALID_URL = r'.*'
79649588 43 IE_NAME = 'generic'
cfe50f04
JMF
44 _TESTS = [
45 {
79649588 46 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
d360a146 47 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
79649588 48 'info_dict': {
d360a146
S
49 'id': '13601338388002',
50 'ext': 'mp4',
79649588
PH
51 'uploader': 'www.hodiho.fr',
52 'title': 'R\u00e9gis plante sa Jeep',
cfe50f04
JMF
53 }
54 },
c19f7764
JMF
55 # bandcamp page with custom domain
56 {
79649588
PH
57 'add_ie': ['Bandcamp'],
58 'url': 'http://bronyrock.com/track/the-pony-mash',
79649588 59 'info_dict': {
fd50bf62
S
60 'id': '3235767654',
61 'ext': 'mp3',
79649588
PH
62 'title': 'The Pony Mash',
63 'uploader': 'M_Pallante',
c19f7764 64 },
79649588 65 'skip': 'There is a limit of 200 free downloads / month for the test song',
c19f7764 66 },
eeb165e6 67 # embedded brightcove video
dd5bcdc4
JMF
68 # it also tests brightcove videos that need to set the 'Referer' in the
69 # http requests
eeb165e6 70 {
79649588
PH
71 'add_ie': ['Brightcove'],
72 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
73 'info_dict': {
74 'id': '2765128793001',
75 'ext': 'mp4',
76 'title': 'Le cours de bourse : l’analyse technique',
77 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
78 'uploader': 'BFM BUSINESS',
eeb165e6 79 },
79649588
PH
80 'params': {
81 'skip_download': True,
eeb165e6
JMF
82 },
83 },
17ab4d3b
PH
84 {
85 # https://github.com/rg3/youtube-dl/issues/2253
86 'url': 'http://bcove.me/i6nfkrc3',
17ab4d3b
PH
87 'md5': '0ba9446db037002366bab3b3eb30c88c',
88 'info_dict': {
fd50bf62
S
89 'id': '3101154703001',
90 'ext': 'mp4',
17ab4d3b
PH
91 'title': 'Still no power',
92 'uploader': 'thestar.com',
93 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
94 },
95 'add_ie': ['Brightcove'],
96 },
0479c625
S
97 {
98 'url': 'http://www.championat.com/video/football/v/87/87499.html',
99 'md5': 'fb973ecf6e4a78a67453647444222983',
100 'info_dict': {
101 'id': '3414141473001',
102 'ext': 'mp4',
103 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
104 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
105 'uploader': 'Championat',
106 },
107 },
bdf97017 108 {
37aab278 109 # https://github.com/rg3/youtube-dl/issues/3541
bdf97017
NJ
110 'add_ie': ['Brightcove'],
111 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
112 'info_dict': {
113 'id': '3866516442001',
37aab278 114 'ext': 'mp4',
bdf97017
NJ
115 'title': 'Leer mij vrouwen kennen: Aflevering 1',
116 'description': 'Leer mij vrouwen kennen: Aflevering 1',
117 'uploader': 'SBS Broadcasting',
118 },
37aab278 119 'skip': 'Restricted to Netherlands',
bdf97017 120 'params': {
37aab278 121 'skip_download': True, # m3u8 download
bdf97017
NJ
122 },
123 },
42393ce2
PH
124 # Direct link to a video
125 {
79649588 126 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
79649588
PH
127 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
128 'info_dict': {
129 'id': 'trailer',
89ef304b 130 'ext': 'mp4',
79649588
PH
131 'title': 'trailer',
132 'upload_date': '20100513',
42393ce2 133 }
c0d0b01f
JMF
134 },
135 # ooyala video
136 {
79649588 137 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
87830900 138 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
79649588
PH
139 'info_dict': {
140 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
141 'ext': 'mp4',
3486df38 142 'title': '2cc213299525360.mov', # that's what we get
c0d0b01f 143 },
87830900 144 'add_ie': ['Ooyala'],
c0d0b01f 145 },
f076b638 146 # multiple ooyala embeds on SBN network websites
147 {
148 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
149 'info_dict': {
150 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
151 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
152 },
153 'playlist_mincount': 3,
154 'params': {
155 'skip_download': True,
156 },
157 'add_ie': ['Ooyala'],
158 },
89ef304b
PH
159 # google redirect
160 {
161 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
162 'info_dict': {
163 'id': 'cmQHVoWB5FY',
164 'ext': 'mp4',
165 'upload_date': '20130224',
166 'uploader_id': 'TheVerge',
87830900 167 'description': 're:^Chris Ziegler takes a look at the\.*',
89ef304b
PH
168 'uploader': 'The Verge',
169 'title': 'First Firefox OS phones side-by-side',
170 },
171 'params': {
172 'skip_download': False,
173 }
f55a1f0a 174 },
1b86cc41 175 # embed.ly video
176 {
177 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
178 'info_dict': {
179 'id': '9ODmcdjQcHQ',
180 'ext': 'mp4',
0a5bce56
PH
181 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
182 'upload_date': '20140225',
183 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
184 'uploader': 'Tested',
185 'uploader_id': 'testedcom',
1b86cc41 186 },
187 # No need to test YoutubeIE here
188 'params': {
189 'skip_download': True,
190 },
191 },
60cc4dc4
PH
192 # funnyordie embed
193 {
194 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
60cc4dc4
PH
195 'info_dict': {
196 'id': '18e820ec3f',
197 'ext': 'mp4',
198 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
199 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
93d020dd 200 },
60cc4dc4 201 },
faa4ea68
S
202 # BBC iPlayer embeds
203 {
204 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
205 'info_dict': {
206 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
207 },
208 'playlist_mincount': 18,
209 },
93d020dd
S
210 # RUTV embed
211 {
212 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
213 'info_dict': {
214 'id': '776940',
215 'ext': 'mp4',
216 'title': 'Охотское море стало целиком российским',
217 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
218 },
219 'params': {
220 # m3u8 download
221 'skip_download': True,
222 },
aab74fa1
PH
223 },
224 # Embedded TED video
225 {
226 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
a8eb5a8e 227 'md5': '65fdff94098e4a607385a60c5177c638',
aab74fa1 228 'info_dict': {
a8eb5a8e 229 'id': '1969',
aab74fa1 230 'ext': 'mp4',
a8eb5a8e
PH
231 'title': 'Hidden miracles of the natural world',
232 'uploader': 'Louie Schwartzberg',
233 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
aab74fa1 234 }
60cc4dc4 235 },
5c386252 236 # Embedded Ustream video
237 {
238 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
239 'md5': '27b99cdb639c9b12a79bca876a073417',
240 'info_dict': {
ca6aada4 241 'id': '45734260',
242 'ext': 'flv',
243 'uploader': 'AU SPA: The NSA and Privacy',
5c386252 244 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
245 }
246 },
d95e35d6
S
247 # nowvideo embed hidden behind percent encoding
248 {
249 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
250 'md5': '2baf4ddd70f697d94b1c18cf796d5107',
251 'info_dict': {
252 'id': '06e53103ca9aa',
253 'ext': 'flv',
254 'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
255 'description': 'No description',
256 },
0f2a2ba1 257 },
893f8832
PH
258 # arte embed
259 {
260 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
261 'md5': '7653032cbb25bf6c80d80f217055fa43',
262 'info_dict': {
263 'id': '048195-004_PLUS7-F',
264 'ext': 'flv',
265 'title': 'X:enius',
266 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
267 'upload_date': '20140320',
268 },
269 'params': {
270 'skip_download': 'Requires rtmpdump'
271 }
272 },
fa35cdad
PH
273 # Condé Nast embed
274 {
275 'url': 'http://www.wired.com/2014/04/honda-asimo/',
276 'md5': 'ba0dfe966fa007657bd1443ee672db0f',
277 'info_dict': {
278 'id': '53501be369702d3275860000',
279 'ext': 'mp4',
280 'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
281 }
ebd3c7b3
PH
282 },
283 # Dailymotion embed
284 {
285 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
286 'md5': '441aeeb82eb72c422c7f14ec533999cd',
287 'info_dict': {
288 'id': 'k2mm4bCdJ6CQ2i7c8o2',
289 'ext': 'mp4',
290 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
291 'uploader': 'Spi0n',
292 },
293 'add_ie': ['Dailymotion'],
2b88feed
PH
294 },
295 # YouTube embed
296 {
297 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
298 'info_dict': {
299 'id': 'FXRb4ykk4S0',
300 'ext': 'mp4',
301 'title': 'The NBL Auction 2014',
302 'uploader': 'BADMINTON England',
303 'uploader_id': 'BADMINTONEvents',
304 'upload_date': '20140603',
305 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
306 },
307 'add_ie': ['Youtube'],
308 'params': {
309 'skip_download': True,
310 }
311 },
c5cd249e
JMF
312 # MTVServices embed
313 {
314 'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
315 'md5': '35727f82f58c76d996fc188f9755b0d5',
316 'info_dict': {
317 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
318 'ext': 'mp4',
319 'title': 'Review',
320 'description': 'Mario\'s life in the fast lane has never looked so good.',
321 },
322 },
61013473 323 # YouTube embed via <data-embed-url="">
324 {
325 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
61013473 326 'info_dict': {
a8eb5a8e 327 'id': '4vAffPZIT44',
61013473 328 'ext': 'mp4',
a8eb5a8e 329 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
ed2d6a19
PH
330 'uploader': 'Gameloft',
331 'uploader_id': 'gameloft',
a8eb5a8e
PH
332 'upload_date': '20140828',
333 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
ed2d6a19
PH
334 },
335 'params': {
336 'skip_download': True,
61013473 337 }
c8e9a235
PH
338 },
339 # Camtasia studio
340 {
341 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
342 'playlist': [{
343 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
344 'info_dict': {
345 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
346 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
347 'ext': 'flv',
348 'duration': 2235.90,
349 }
350 }, {
351 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
352 'info_dict': {
353 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
354 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
355 'ext': 'flv',
356 'duration': 2235.93,
357 }
358 }],
359 'info_dict': {
360 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
361 }
4d805e06
PH
362 },
363 # Flowplayer
364 {
365 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
366 'md5': '9d65602bf31c6e20014319c7d07fba27',
367 'info_dict': {
368 'id': '5123ea6d5e5a7',
369 'ext': 'mp4',
370 'age_limit': 18,
371 'uploader': 'www.handjobhub.com',
d6d9186f 372 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
4d805e06 373 }
0990305d
PH
374 },
375 # RSS feed
376 {
377 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
378 'info_dict': {
379 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
380 'title': 'Zero Punctuation',
b1b0b1ca 381 'description': 're:.*groundbreaking video review series.*'
0990305d
PH
382 },
383 'playlist_mincount': 11,
22a6f150
PH
384 },
385 # Multiple brightcove videos
386 # https://github.com/rg3/youtube-dl/issues/2283
387 {
388 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
389 'info_dict': {
390 'id': 'always-never',
391 'title': 'Always / Never - The New Yorker',
392 },
393 'playlist_count': 3,
394 'params': {
395 'extract_flat': False,
396 'skip_download': True,
397 }
1a94ff68
S
398 },
399 # MLB embed
400 {
401 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
402 'md5': '96f09a37e44da40dd083e12d9a683327',
403 'info_dict': {
404 'id': '33322633',
405 'ext': 'mp4',
406 'title': 'Ump changes call to ball',
407 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
408 'duration': 48,
409 'timestamp': 1401537900,
410 'upload_date': '20140531',
411 'thumbnail': 're:^https?://.*\.jpg$',
412 },
413 },
746c67d7
NJ
414 # Wistia embed
415 {
416 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
417 'md5': '8788b683c777a5cf25621eaf286d0c23',
418 'info_dict': {
419 'id': '1cfaf6b7ea',
420 'ext': 'mov',
421 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
422 'duration': 643.0,
423 'filesize': 182808282,
424 'uploader': 'education-portal.com',
425 },
426 },
52cffcb1 427 {
428 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
429 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
430 'info_dict': {
431 'id': 'uxjb0lwrcz',
432 'ext': 'mp4',
85d7b765 433 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
52cffcb1 434 'duration': 1715.0,
85d7b765 435 'uploader': 'thoughtworks.wistia.com',
70b7e3fb 436 },
52cffcb1 437 },
70b7e3fb
PH
438 # Direct download with broken HEAD
439 {
440 'url': 'http://ai-radio.org:8000/radio.opus',
441 'info_dict': {
442 'id': 'radio',
443 'ext': 'opus',
444 'title': 'radio',
445 },
446 'params': {
447 'skip_download': True, # infinite live stream
448 },
449 'expected_warnings': [
450 r'501.*Not Implemented'
451 ],
ac645ac7
PH
452 },
453 # Soundcloud embed
454 {
455 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
456 'info_dict': {
457 'id': '174391317',
458 'ext': 'mp3',
459 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
460 'uploader': 'Sophos Security',
461 'title': 'Chet Chat 171 - Oct 29, 2014',
462 'upload_date': '20141029',
463 }
af63fed7
PH
464 },
465 # Livestream embed
466 {
467 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
468 'info_dict': {
469 'id': '67864563',
470 'ext': 'flv',
471 'upload_date': '20141112',
472 'title': 'Rosetta #CometLanding webcast HL 10',
473 }
474 },
65f3a228
PH
475 # LazyYT
476 {
477 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
478 'info_dict': {
11e611a7 479 'id': '1986',
65f3a228
PH
480 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
481 },
482 'playlist_mincount': 2,
4e262a88
PH
483 },
484 # Direct link with incorrect MIME type
485 {
486 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
487 'md5': '4ccbebe5f36706d85221f204d7eb5913',
488 'info_dict': {
489 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
490 'id': '5_Lennart_Poettering_-_Systemd',
491 'ext': 'webm',
492 'title': '5_Lennart_Poettering_-_Systemd',
493 'upload_date': '20141120',
494 },
495 'expected_warnings': [
496 'URL could be a direct video link, returning it as such.'
497 ]
42bdd9d0
PH
498 },
499 # Cinchcast embed
500 {
501 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
502 'info_dict': {
503 'id': '7141703',
504 'ext': 'mp3',
505 'upload_date': '20141126',
506 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
507 }
508 },
501f13fb
PH
509 # Cinerama player
510 {
511 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
512 'info_dict': {
513 'id': '730m_DandD_1901_512k',
514 'ext': 'mp4',
515 'uploader': 'www.abc.net.au',
516 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
517 }
796df3c6
S
518 },
519 # embedded viddler video
520 {
521 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
522 'info_dict': {
523 'id': '4d03aad9',
524 'ext': 'mp4',
525 'uploader': 'deadspin',
526 'title': 'WALL-TO-GORTAT',
527 'timestamp': 1422285291,
528 'upload_date': '20150126',
529 },
530 'add_ie': ['Viddler'],
a0f71985 531 },
2051acde
S
532 # Libsyn embed
533 {
534 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
535 'info_dict': {
536 'id': '3377616',
537 'ext': 'mp3',
538 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
539 'description': 'md5:601cb790edd05908957dae8aaa866465',
540 'upload_date': '20150220',
541 },
542 },
a0f71985
PH
543 # jwplayer YouTube
544 {
545 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
546 'info_dict': {
547 'id': 'Mrj4DVp2zeA',
548 'ext': 'mp4',
f37e3f99 549 'upload_date': '20150212',
a0f71985
PH
550 'uploader': 'The National Archives UK',
551 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
552 'uploader_id': 'NationalArchives08',
553 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
554 },
59b8ab58
PH
555 },
556 # rtl.nl embed
557 {
558 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
559 'playlist_mincount': 5,
560 'info_dict': {
561 'id': 'aanslagen-kopenhagen',
562 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
563 }
255fca5e
S
564 },
565 # Zapiks embed
566 {
567 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
568 'info_dict': {
569 'id': '118046',
570 'ext': 'mp4',
571 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
572 }
573 },
e3216b82
NJ
574 # Kaltura embed
575 {
576 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
577 'info_dict': {
578 'id': '1_eergr3h1',
579 'ext': 'mp4',
580 'upload_date': '20150226',
581 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
582 'timestamp': int,
583 'title': 'John Carlson Postgame 2/25/15',
584 },
585 },
135c9c42
S
586 # Eagle.Platform embed (generic URL)
587 {
588 'url': 'http://lenta.ru/news/2015/03/06/navalny/',
589 'info_dict': {
590 'id': '227304',
591 'ext': 'mp4',
592 'title': 'Навальный вышел на свободу',
593 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
594 'thumbnail': 're:^https?://.*\.jpg$',
595 'duration': 87,
596 'view_count': int,
597 'age_limit': 0,
598 },
599 },
d47ae7f6
S
600 # ClipYou (Eagle.Platform) embed (custom URL)
601 {
602 'url': 'http://muz-tv.ru/play/7129/',
603 'info_dict': {
604 'id': '12820',
605 'ext': 'mp4',
606 'title': "'O Sole Mio",
607 'thumbnail': 're:^https?://.*\.jpg$',
608 'duration': 216,
609 'view_count': int,
610 },
611 },
f8388757
S
612 # Pladform embed
613 {
614 'url': 'http://muz-tv.ru/kinozal/view/7400/',
615 'info_dict': {
616 'id': '100183293',
617 'ext': 'mp4',
618 'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
619 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
620 'thumbnail': 're:^https?://.*\.jpg$',
621 'duration': 694,
622 'age_limit': 0,
623 },
624 },
ad320e9b
NJ
625 # 5min embed
626 {
627 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
628 'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
629 'info_dict': {
630 'id': '518726732',
631 'ext': 'mp4',
632 'title': 'Facebook Creates "On This Day" | Crunch Report',
633 },
634 },
76c73715
PH
635 # RSS feed with enclosure
636 {
637 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
638 'info_dict': {
639 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
640 'ext': 'm4v',
641 'upload_date': '20150228',
642 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
643 }
a2edf2e7 644 },
facecb84 645 # NBC Sports vplayer embed
a2edf2e7 646 {
facecb84 647 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
a2edf2e7 648 'info_dict': {
facecb84
S
649 'id': 'ln7x1qSThw4k',
650 'ext': 'flv',
651 'title': "PFT Live: New leader in the 'new-look' defense",
652 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
a2edf2e7 653 },
418c5cc3
YCH
654 },
655 # UDN embed
656 {
657 'url': 'http://www.udn.com/news/story/7314/822787',
658 'md5': 'de06b4c90b042c128395a88f0384817e',
659 'info_dict': {
660 'id': '300040',
661 'ext': 'mp4',
662 'title': '生物老師男變女 全校挺"做自己"',
663 'thumbnail': 're:^https?://.*\.jpg$',
664 }
76c73715 665 }
cfe50f04 666 ]
9b122384 667
9b122384
PH
def report_following_redirect(self, new_url):
    """Tell the user that the request was redirected to new_url."""
    message = '[redirect] Following redirect to %s' % new_url
    self._downloader.to_screen(message)
9b122384 671
4fc946b5
PH
def _extract_rss(self, url, video_id, doc):
    """Turn a parsed RSS document into a playlist of URL entries.

    doc is the root element of the feed.  Every ./channel/item contributes
    one entry that points at the item's <link> or, when there is no usable
    link, at the first <enclosure> carrying a non-empty url attribute.
    Items offering neither are dropped.

    The feed URL itself is used as the playlist id; video_id is accepted
    but not used here.

    A missing <title> (on the channel or on an item) yields a title of
    None instead of raising AttributeError, mirroring how the channel
    description is treated.
    """

    def text_of(parent, path):
        # find() returns None for an absent child, so guard before .text
        node = parent.find(path)
        return None if node is None else node.text

    playlist_title = text_of(doc, './channel/title')
    playlist_desc = text_of(doc, './channel/description')

    entries = []
    for it in doc.findall('./channel/item'):
        next_url = xpath_text(it, 'link', fatal=False)
        if not next_url:
            # No usable <link>: fall back to the first enclosure with a url
            for e in it.findall('./enclosure'):
                next_url = e.attrib.get('url')
                if next_url:
                    break

        if not next_url:
            continue

        entries.append({
            '_type': 'url',
            'url': next_url,
            'title': text_of(it, 'title'),
        })

    return {
        '_type': 'playlist',
        'id': url,
        'title': playlist_title,
        'description': playlist_desc,
        'entries': entries,
    }
703
c8e9a235
PH
def _extract_camtasia(self, url, video_id, webpage):
    """Extract the videos of a Camtasia presentation as a playlist.

    The page is recognised by an fo.addVariable("csConfigFile", "...")
    call naming an XML configuration file.  That file is resolved against
    url and downloaded; each child of its ./playlist/array/fileset element
    that has a <uri> becomes one playlist entry.

    Returns a playlist dict, or None if no camtasia video can be found
    (no configuration reference in the page, or no fileset element in the
    configuration).  The DC.title meta tag is looked up with fatal=True
    once a configuration reference has been found.
    """

    camtasia_cfg = self._search_regex(
        r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
        webpage, 'camtasia configuration file', default=None)
    if camtasia_cfg is None:
        return None

    title = self._html_search_meta('DC.title', webpage, fatal=True)

    camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
    camtasia_cfg = self._download_xml(
        camtasia_url, video_id,
        note='Downloading camtasia configuration',
        errnote='Failed to download camtasia configuration')
    fileset_node = camtasia_cfg.find('./playlist/array/fileset')
    if fileset_node is None:
        # Configuration without a fileset: nothing to extract
        return None

    entries = []
    # Iterate the element itself; Element.getchildren() is deprecated and
    # was removed in Python 3.9.
    for n in fileset_node:
        url_n = n.find('./uri')
        if url_n is None or not url_n.text:
            continue

        duration_n = n.find('./duration')
        entries.append({
            # id is the basename of the uri without its extension
            'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
            'title': '%s - %s' % (title, n.tag),
            'url': compat_urlparse.urljoin(url, url_n.text),
            'duration': (
                None if duration_n is None
                else float_or_none(duration_n.text)),
        })

    return {
        '_type': 'playlist',
        'entries': entries,
        'title': title,
    }
740
9b122384 741 def _real_extract(self, url):
ebd3c7b3
PH
742 if url.startswith('//'):
743 return {
744 '_type': 'url',
20991253 745 'url': self.http_scheme() + url,
ebd3c7b3
PH
746 }
747
a7130543
JMF
748 parsed_url = compat_urlparse.urlparse(url)
749 if not parsed_url.scheme:
04b4d394
PH
750 default_search = self._downloader.params.get('default_search')
751 if default_search is None:
1f7ccb90 752 default_search = 'fixup_error'
04b4d394 753
1f7ccb90 754 if default_search in ('auto', 'auto_warning', 'fixup_error'):
04b4d394
PH
755 if '/' in url:
756 self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
757 return self.url_result('http://' + url)
1f7ccb90 758 elif default_search != 'fixup_error':
9c1fc022 759 if default_search == 'auto_warning':
0e67ab0d
PH
760 if re.match(r'^(?:url|URL)$', url):
761 raise ExtractorError(
762 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
763 expected=True)
764 else:
765 self._downloader.report_warning(
7571c02c 766 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
04b4d394 767 return self.url_result('ytsearch:' + url)
1f7ccb90
PH
768
769 if default_search in ('error', 'fixup_error'):
7571c02c 770 raise ExtractorError(
b74e86f4
PH
771 '%r is not a valid URL. '
772 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
773 % (url, url), expected=True)
04b4d394 774 else:
f2f2c0c2
PH
775 if ':' not in default_search:
776 default_search += ':'
04b4d394 777 return self.url_result(default_search + url)
4d54ef20
PH
778
779 url, smuggled_data = unsmuggle_url(url)
780 force_videoid = None
d6e6a422 781 is_intentional = smuggled_data and smuggled_data.get('to_generic')
4d54ef20
PH
782 if smuggled_data and 'force_videoid' in smuggled_data:
783 force_videoid = smuggled_data['force_videoid']
784 video_id = force_videoid
785 else:
786 video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
a7130543 787
79649588 788 self.to_screen('%s: Requesting header' % video_id)
c1d1facd 789
ebab4520 790 head_req = HEADRequest(url)
23be51d8 791 head_response = self._request_webpage(
ebab4520
PH
792 head_req, video_id,
793 note=False, errnote='Could not send HEAD request to %s' % url,
794 fatal=False)
42393ce2 795
23be51d8 796 if head_response is not False:
42393ce2 797 # Check for redirect
23be51d8 798 new_url = head_response.geturl()
42393ce2
PH
799 if url != new_url:
800 self.report_following_redirect(new_url)
4d54ef20
PH
801 if force_videoid:
802 new_url = smuggle_url(
803 new_url, {'force_videoid': force_videoid})
cecaaf3f 804 return self.url_result(new_url)
42393ce2 805
23be51d8
PH
806 full_response = None
807 if head_response is False:
808 full_response = self._request_webpage(url, video_id)
809 head_response = full_response
810
811 # Check for direct link to a video
812 content_type = head_response.headers.get('Content-Type', '')
813 m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
814 if m:
815 upload_date = unified_strdate(
816 head_response.headers.get('Last-Modified'))
817 return {
818 'id': video_id,
819 'title': os.path.splitext(url_basename(url))[0],
ccdd0ffb 820 'direct': True,
23be51d8
PH
821 'formats': [{
822 'format_id': m.group('format_id'),
823 'url': url,
824 'vcodec': 'none' if m.group('type') == 'audio' else None
825 }],
826 'upload_date': upload_date,
827 }
42393ce2 828
d6e6a422
PH
829 if not self._downloader.params.get('test', False) and not is_intentional:
830 self._downloader.report_warning('Falling back on generic information extractor.')
831
4e262a88
PH
832 if not full_response:
833 full_response = self._request_webpage(url, video_id)
834
835 # Maybe it's a direct link to a video?
836 # Be careful not to download the whole thing!
837 first_bytes = full_response.read(512)
61ca9a80 838 if not is_html(first_bytes):
4e262a88
PH
839 self._downloader.report_warning(
840 'URL could be a direct video link, returning it as such.')
841 upload_date = unified_strdate(
842 head_response.headers.get('Last-Modified'))
843 return {
844 'id': video_id,
845 'title': os.path.splitext(url_basename(url))[0],
846 'direct': True,
847 'url': url,
848 'upload_date': upload_date,
849 }
850
851 webpage = self._webpage_read_content(
852 full_response, url, video_id, prefix=first_bytes)
853
9b122384 854 self.report_extraction(video_id)
887c6acd 855
4fc946b5
PH
856 # Is it an RSS feed?
857 try:
bcf89ce6 858 doc = parse_xml(webpage)
4fc946b5
PH
859 if doc.tag == 'rss':
860 return self._extract_rss(url, video_id, doc)
f7300c5c 861 except compat_xml_parse_error:
4fc946b5
PH
862 pass
863
c8e9a235
PH
864 # Is it a Camtasia project?
865 camtasia_res = self._extract_camtasia(url, video_id, webpage)
866 if camtasia_res is not None:
867 return camtasia_res
868
14390730
S
869 # Sometimes embedded video player is hidden behind percent encoding
870 # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
871 # Unescaping the whole page allows to handle those cases in a generic way
1f7659db
S
872 webpage = compat_urllib_parse.unquote(webpage)
873
887c6acd
PH
874 # it's tempting to parse this further, but you would
875 # have to take into account all the variations like
876 # Video Title - Site Name
877 # Site Name | Video Title
878 # Video Title - Tagline | Site Name
879 # and so on and so forth; it's just not practical
ef4fd848 880 video_title = self._html_search_regex(
79649588
PH
881 r'(?s)<title>(.*?)</title>', webpage, 'video title',
882 default='video')
ef4fd848 883
4d805e06
PH
884 # Try to detect age limit automatically
885 age_limit = self._rta_search(webpage)
886 # And then there are the jokers who advertise that they use RTA,
887 # but actually don't.
888 AGE_LIMIT_MARKERS = [
889 r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
890 ]
891 if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
892 age_limit = 18
893
ef4fd848
PH
894 # video uploader is domain name
895 video_uploader = self._search_regex(
79649588 896 r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
887c6acd 897
ed2d6a19 898 # Helper method
83992676 899 def _playlist_from_matches(matches, getter=None, ie=None):
3b2f933b 900 urlrs = orderedSet(
83992676 901 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
3b2f933b 902 for m in matches)
ed2d6a19
PH
903 return self.playlist_result(
904 urlrs, playlist_id=video_id, playlist_title=video_title)
905
627a91a9 906 # Look for BrightCove:
99877772
PH
907 bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
908 if bc_urls:
79649588 909 self.to_screen('Brightcove video detected.')
99877772
PH
910 entries = [{
911 '_type': 'url',
912 'url': smuggle_url(bc_url, {'Referer': url}),
913 'ie_key': 'Brightcove'
914 } for bc_url in bc_urls]
915
916 return {
917 '_type': 'playlist',
918 'title': video_title,
919 'id': video_id,
920 'entries': entries,
921 }
cfe50f04 922
59b8ab58
PH
923 # Look for embedded rtl.nl player
924 matches = re.findall(
925 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
926 webpage)
927 if matches:
928 return _playlist_from_matches(matches, ie='RtlNl')
929
7115ca84 930 # Look for embedded (iframe) Vimeo player
9d4660ca 931 mobj = re.search(
15fd51b3 932 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
9d4660ca 933 if mobj:
15fd51b3 934 player_url = unescapeHTML(mobj.group('url'))
9d4660ca 935 surl = smuggle_url(player_url, {'Referer': url})
09a42738 936 return self.url_result(surl)
7115ca84
PH
937 # Look for embedded (swf embed) Vimeo player
938 mobj = re.search(
09a42738 939 r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
7115ca84 940 if mobj:
09a42738 941 return self.url_result(mobj.group(1))
7115ca84 942
53c1d3ef 943 # Look for embedded YouTube player
1f9da904 944 matches = re.findall(r'''(?x)
2b88feed
PH
945 (?:
946 <iframe[^>]+?src=|
c71dfccc 947 data-video-url=|
2b88feed 948 <embed[^>]+?src=|
a7e97f6d
PH
949 embedSWF\(?:\s*|
950 new\s+SWFObject\(
2b88feed
PH
951 )
952 (["\'])
1bf5423e 953 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
6b08cdf6 954 (?:embed|v|p)/.+?)
1f9da904 955 \1''', webpage)
887c6acd 956 if matches:
ed2d6a19 957 return _playlist_from_matches(
3b2f933b 958 matches, lambda m: unescapeHTML(m[1]))
53c1d3ef 959
65f3a228
PH
960 # Look for lazyYT YouTube embed
961 matches = re.findall(
962 r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
963 if matches:
964 return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
965
355e4fd0
PH
966 # Look for embedded Dailymotion player
967 matches = re.findall(
ef4fd848 968 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
355e4fd0 969 if matches:
ed2d6a19
PH
970 return _playlist_from_matches(
971 matches, lambda m: unescapeHTML(m[1]))
355e4fd0 972
8489578d
NJ
973 # Look for embedded Dailymotion playlist player (#3822)
974 m = re.search(
975 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
976 if m:
977 playlists = re.findall(
978 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
979 if playlists:
980 return _playlist_from_matches(
981 playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
982
ef4fd848
PH
983 # Look for embedded Wistia player
984 match = re.search(
281d3f1d 985 r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
ef4fd848 986 if match:
9471c444
NJ
987 embed_url = self._proto_relative_url(
988 unescapeHTML(match.group('url')))
ef4fd848
PH
989 return {
990 '_type': 'url_transparent',
9471c444 991 'url': embed_url,
ef4fd848
PH
992 'ie_key': 'Wistia',
993 'uploader': video_uploader,
994 'title': video_title,
995 'id': video_id,
996 }
5f6a1245 997
9471c444 998 match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
746c67d7
NJ
999 if match:
1000 return {
1001 '_type': 'url_transparent',
1002 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
1003 'ie_key': 'Wistia',
1004 'uploader': video_uploader,
1005 'title': video_title,
1006 'id': match.group('id')
1007 }
ef4fd848 1008
ee3e63e4 1009 # Look for embedded blip.tv player
19dab5e6 1010 mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
ee3e63e4 1011 if mobj:
2514d263 1012 return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
1f8b6af7 1013 mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
ee3e63e4 1014 if mobj:
19dab5e6 1015 return self.url_result(mobj.group(1), 'BlipTV')
ee3e63e4 1016
fa35cdad
PH
1017 # Look for embedded condenast player
1018 matches = re.findall(
1019 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1020 webpage)
1021 if matches:
1022 return {
1023 '_type': 'playlist',
1024 'entries': [{
1025 '_type': 'url',
1026 'ie_key': 'CondeNast',
1027 'url': ma,
1028 } for ma in matches],
1029 'title': video_title,
1030 'id': video_id,
1031 }
1032
c19f7764
JMF
1033 # Look for Bandcamp pages with custom domain
1034 mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1035 if mobj is not None:
1036 burl = unescapeHTML(mobj.group(1))
09804265
JMF
1037 # Don't set the extractor because it can be a track url or an album
1038 return self.url_result(burl)
c19f7764 1039
f25571ff
PH
1040 # Look for embedded Vevo player
1041 mobj = re.search(
1042 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1043 if mobj is not None:
1044 return self.url_result(mobj.group('url'))
796df3c6
S
1045
1046 # Look for embedded Viddler player
cb454b33
S
1047 mobj = re.search(
1048 r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1049 webpage)
796df3c6
S
1050 if mobj is not None:
1051 return self.url_result(mobj.group('url'))
f25571ff 1052
3378d67a
S
1053 # Look for NYTimes player
1054 mobj = re.search(
1055 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1056 webpage)
1057 if mobj is not None:
1058 return self.url_result(mobj.group('url'))
1059
cefdf970
S
1060 # Look for Libsyn player
1061 mobj = re.search(
1062 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1063 if mobj is not None:
1064 return self.url_result(mobj.group('url'))
1065
c0d0b01f 1066 # Look for Ooyala videos
cb454b33 1067 mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
f076b638 1068 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1069 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage))
c0d0b01f 1070 if mobj is not None:
750f9020 1071 return OoyalaIE._build_url_result(mobj.group('ec'))
c0d0b01f 1072
f076b638 1073 # Look for multiple Ooyala embeds on SBN network websites
1074 mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1075 if mobj is not None:
1076 embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1077 if embeds:
1078 return _playlist_from_matches(
1079 embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1080
aa94a6d3 1081 # Look for Aparat videos
48099643 1082 mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
aa94a6d3
PH
1083 if mobj is not None:
1084 return self.url_result(mobj.group(1), 'Aparat')
1085
c93c2ab1 1086 # Look for MPORA videos
c3f51436 1087 mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
c93c2ab1
PH
1088 if mobj is not None:
1089 return self.url_result(mobj.group(1), 'Mpora')
5f59ee79 1090
15c0e8e7 1091 # Look for embedded NovaMov-based player
8f89e687 1092 mobj = re.search(
8dfa187b 1093 r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
15c0e8e7
S
1094 (?P<url>http://(?:(?:embed|www)\.)?
1095 (?:novamov\.com|
1096 nowvideo\.(?:ch|sx|eu|at|ag|co)|
1097 videoweed\.(?:es|com)|
1098 movshare\.(?:net|sx|ag)|
1099 divxstage\.(?:eu|net|ch|co|at|ag))
1100 /embed\.php.+?)\1''', webpage)
8f89e687 1101 if mobj is not None:
15c0e8e7 1102 return self.url_result(mobj.group('url'))
50f56607 1103
9834872b
PH
1104 # Look for embedded Facebook player
1105 mobj = re.search(
db1f3888 1106 r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
9834872b
PH
1107 if mobj is not None:
1108 return self.url_result(mobj.group('url'), 'Facebook')
1109
ca97a56e
S
1110 # Look for embedded VK player
1111 mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1112 if mobj is not None:
1113 return self.url_result(mobj.group('url'), 'VK')
1114
0364fa8b
S
1115 # Look for embedded ivi player
1116 mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1117 if mobj is not None:
1118 return self.url_result(mobj.group('url'), 'Ivi')
1119
db1f3888
PH
1120 # Look for embedded Huffington Post player
1121 mobj = re.search(
c3f51436 1122 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
db1f3888
PH
1123 if mobj is not None:
1124 return self.url_result(mobj.group('url'), 'HuffPost')
1125
1b86cc41 1126 # Look for embed.ly
1127 mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1128 if mobj is not None:
1129 return self.url_result(mobj.group('url'))
1130 mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1131 if mobj is not None:
1132 return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1133
60cc4dc4
PH
1134 # Look for funnyordie embed
1135 matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1136 if matches:
ed2d6a19
PH
1137 return _playlist_from_matches(
1138 matches, getter=unescapeHTML, ie='FunnyOrDie')
60cc4dc4 1139
db546cf8
S
1140 # Look for BBC iPlayer embed
1141 matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1142 if matches:
476eae0c 1143 return _playlist_from_matches(matches, ie='BBCCoUk')
db546cf8 1144
93d020dd
S
1145 # Look for embedded RUTV player
1146 rutv_url = RUTVIE._extract_url(webpage)
1147 if rutv_url:
1148 return self.url_result(rutv_url, 'RUTV')
1149
7e2ede98
JMF
1150 # Look for embedded TED player
1151 mobj = re.search(
d7cc31b6 1152 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
7e2ede98
JMF
1153 if mobj is not None:
1154 return self.url_result(mobj.group('url'), 'TED')
1155
5c386252 1156 # Look for embedded Ustream videos
1157 mobj = re.search(
1158 r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1159 if mobj is not None:
1160 return self.url_result(mobj.group('url'), 'Ustream')
1161
893f8832
PH
1162 # Look for embedded arte.tv player
1163 mobj = re.search(
1164 r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1165 webpage)
1166 if mobj is not None:
1167 return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1168
cb3ac1c6
S
1169 # Look for embedded smotri.com player
1170 smotri_url = SmotriIE._extract_url(webpage)
1171 if smotri_url:
1172 return self.url_result(smotri_url, 'Smotri')
1173
20991253
PH
1174 # Look for embedded soundcloud player
1175 mobj = re.search(
ac645ac7 1176 r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
20991253
PH
1177 webpage)
1178 if mobj is not None:
1179 url = unescapeHTML(mobj.group('url'))
1180 return self.url_result(url)
1181
826ec77f
PH
1182 # Look for embedded vulture.com player
1183 mobj = re.search(
1184 r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1185 webpage)
1186 if mobj is not None:
1187 url = unescapeHTML(mobj.group('url'))
1188 return self.url_result(url, ie='Vulture')
1189
c5cd249e
JMF
1190 # Look for embedded mtvservices player
1191 mobj = re.search(
1192 r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1193 webpage)
1194 if mobj is not None:
1195 url = unescapeHTML(mobj.group('url'))
1196 return self.url_result(url, ie='MTVServicesEmbedded')
1197
49807b4a
S
1198 # Look for embedded yahoo player
1199 mobj = re.search(
1200 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1201 webpage)
1202 if mobj is not None:
1203 return self.url_result(mobj.group('url'), 'Yahoo')
1204
2ef6fcb5
PH
1205 # Look for embedded sbs.com.au player
1206 mobj = re.search(
e98b8e79
PH
1207 r'''(?x)
1208 (?:
1209 <meta\s+property="og:video"\s+content=|
1210 <iframe[^>]+?src=
1211 )
1212 (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
2ef6fcb5
PH
1213 webpage)
1214 if mobj is not None:
1215 return self.url_result(mobj.group('url'), 'SBS')
1216
42bdd9d0
PH
1217 # Look for embedded Cinchcast player
1218 mobj = re.search(
1219 r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1220 webpage)
1221 if mobj is not None:
1222 return self.url_result(mobj.group('url'), 'Cinchcast')
1223
1a94ff68 1224 mobj = re.search(
5263cdfc 1225 r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1a94ff68
S
1226 webpage)
1227 if mobj is not None:
1228 return self.url_result(mobj.group('url'), 'MLB')
1229
1419fafd
S
1230 mobj = re.search(
1231 r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1232 webpage)
1233 if mobj is not None:
1234 return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1235
af63fed7
PH
1236 mobj = re.search(
1237 r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1238 webpage)
1239 if mobj is not None:
1240 return self.url_result(mobj.group('url'), 'Livestream')
1241
255fca5e
S
1242 # Look for Zapiks embed
1243 mobj = re.search(
1244 r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1245 if mobj is not None:
1246 return self.url_result(mobj.group('url'), 'Zapiks')
1247
e3216b82
NJ
1248 # Look for Kaltura embeds
1249 mobj = re.search(
1250 r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1251 if mobj is not None:
1252 return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1253
135c9c42
S
1254 # Look for Eagle.Platform embeds
1255 mobj = re.search(
1256 r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1257 if mobj is not None:
1258 return self.url_result(mobj.group('url'), 'EaglePlatform')
1259
d47ae7f6
S
1260 # Look for ClipYou (uses Eagle.Platform) embeds
1261 mobj = re.search(
1262 r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1263 if mobj is not None:
1264 return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1265
f8388757
S
1266 # Look for Pladform embeds
1267 mobj = re.search(
1268 r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1269 if mobj is not None:
1270 return self.url_result(mobj.group('url'), 'Pladform')
1271
ad320e9b
NJ
1272 # Look for 5min embeds
1273 mobj = re.search(
1274 r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1275 if mobj is not None:
1276 return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1277
18153f1b
S
1278 # Look for Crooks and Liars embeds
1279 mobj = re.search(
1280 r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
1281 if mobj is not None:
1282 return self.url_result(mobj.group('url'))
1283
a2edf2e7
YCH
1284 # Look for NBC Sports VPlayer embeds
1285 nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1286 if nbc_sports_url:
1287 return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1288
418c5cc3
YCH
1289 # Look for UDN embeds
1290 mobj = re.search(
1291 r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
1292 if mobj is not None:
1293 return self.url_result(
0a160363 1294 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
418c5cc3 1295
def check_video(vurl):
    """Return True when ``vurl`` looks like a playable video URL.

    A URL accepted by ``YoutubeIE.suitable`` always qualifies.  Any other
    URL qualifies only if its path contains a dot and the extension
    reported by ``determine_ext`` is not one of the known non-video
    (flash, image, subtitle) extensions listed below.
    """
    non_video_exts = ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')

    if YoutubeIE.suitable(vurl):
        return True

    path = compat_urlparse.urlparse(vurl).path
    if '.' not in path:
        # No dot in the path, hence nothing that could be a file extension
        return False
    return determine_ext(path) not in non_video_exts
1302
def filter_video(urls):
    """Return a list of the entries of ``urls`` accepted by ``check_video``."""
    return [candidate for candidate in urls if check_video(candidate)]
1305
9b122384 1306 # Start with something easy: JW Player in SWFObject
ced659bb 1307 found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
b30b8698 1308 if not found:
d981cef6 1309 # Look for gorilla-vid style embedding
ced659bb 1310 found = filter_video(re.findall(r'''(?sx)
c0292e8a
PH
1311 (?:
1312 jw_plugins|
1313 JWPlayerOptions|
1314 jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1315 )
a0f71985
PH
1316 .*?
1317 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
b30b8698 1318 if not found:
9b122384 1319 # Broaden the search a little bit
ced659bb 1320 found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
b30b8698
PH
1321 if not found:
1322 # Broaden the findall a little bit: JWPlayer JS loader
ced659bb
S
1323 found = filter_video(re.findall(
1324 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
4d805e06
PH
1325 if not found:
1326 # Flow player
ced659bb 1327 found = filter_video(re.findall(r'''(?xs)
4d805e06
PH
1328 flowplayer\("[^"]+",\s*
1329 \{[^}]+?\}\s*,
52585fd6 1330 \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
4d805e06 1331 ["']?url["']?\s*:\s*["']([^"']+)["']
ced659bb 1332 ''', webpage))
501f13fb
PH
1333 if not found:
1334 # Cinerama player
1335 found = re.findall(
1336 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
b30b8698 1337 if not found:
9b122384 1338 # Try to find twitter cards info
ced659bb
S
1339 found = filter_video(re.findall(
1340 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
b30b8698 1341 if not found:
9b122384
PH
1342 # We look for Open Graph info:
1343 # We have to match any number of spaces between elements, some sites try to align them (e.g. statigr.am)
b30b8698 1344 m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
9b122384
PH
1345 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1346 if m_video_type is not None:
ced659bb 1347 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
b30b8698 1348 if not found:
7fea7156 1349 # HTML5 video
9b32eca3 1350 found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
b30b8698 1351 if not found:
ed9a25dd 1352 REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
a5a45015 1353 found = re.search(
89ef304b 1354 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
ed9a25dd 1355 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
89ef304b 1356 webpage)
84f81016
S
1357 if not found:
1358 # Look also in Refresh HTTP header
1359 refresh_header = head_response.headers.get('Refresh')
1360 if refresh_header:
ed9a25dd 1361 found = re.search(REDIRECT_REGEX, refresh_header)
b30b8698
PH
1362 if found:
1363 new_url = found.group(1)
89ef304b
PH
1364 self.report_following_redirect(new_url)
1365 return {
1366 '_type': 'url',
1367 'url': new_url,
1368 }
b30b8698 1369 if not found:
416c7fcb 1370 raise UnsupportedError(url)
9b122384 1371
b30b8698
PH
1372 entries = []
1373 for video_url in found:
1374 video_url = compat_urlparse.urljoin(url, video_url)
1375 video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
9b122384 1376
b30b8698
PH
1377 # Sometimes, jwplayer extraction will result in a YouTube URL
1378 if YoutubeIE.suitable(video_url):
1379 entries.append(self.url_result(video_url, 'Youtube'))
1380 continue
9b122384 1381
b30b8698
PH
1382 # here's a fun little line of code for you:
1383 video_id = os.path.splitext(video_id)[0]
fc9713a1 1384
b30b8698
PH
1385 entries.append({
1386 'id': video_id,
1387 'url': video_url,
1388 'uploader': video_uploader,
1389 'title': video_title,
4d805e06 1390 'age_limit': age_limit,
b30b8698
PH
1391 })
1392
1393 if len(entries) == 1:
669f0e7c 1394 return entries[0]
b30b8698
PH
1395 else:
1396 for num, e in enumerate(entries, start=1):
13d8fbef
JMF
1397 # 'url' results don't have a title
1398 if e.get('title') is not None:
1399 e['title'] = '%s (%d)' % (e['title'], num)
b30b8698
PH
1400 return {
1401 '_type': 'playlist',
1402 'entries': entries,
1403 }