]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/generic.py
[extractor/generic] Add test for tvc embed
[yt-dlp.git] / youtube_dl / extractor / generic.py
1 # encoding: utf-8
2
3 from __future__ import unicode_literals
4
5 import os
6 import re
7
8 from .common import InfoExtractor
9 from .youtube import YoutubeIE
10 from ..compat import (
11 compat_urllib_parse,
12 compat_urllib_parse_unquote,
13 compat_urllib_request,
14 compat_urlparse,
15 compat_xml_parse_error,
16 )
17 from ..utils import (
18 determine_ext,
19 ExtractorError,
20 float_or_none,
21 HEADRequest,
22 is_html,
23 orderedSet,
24 parse_xml,
25 smuggle_url,
26 unescapeHTML,
27 unified_strdate,
28 unsmuggle_url,
29 UnsupportedError,
30 url_basename,
31 xpath_text,
32 )
33 from .brightcove import BrightcoveIE
34 from .nbc import NBCSportsVPlayerIE
35 from .ooyala import OoyalaIE
36 from .rutv import RUTVIE
37 from .tvc import TVCIE
38 from .sportbox import SportBoxEmbedIE
39 from .smotri import SmotriIE
40 from .condenast import CondeNastIE
41 from .udn import UDNEmbedIE
42 from .senateisvp import SenateISVPIE
43 from .bliptv import BlipTVIE
44 from .svt import SVTIE
45
46
47 class GenericIE(InfoExtractor):
48 IE_DESC = 'Generic downloader that works on some sites'
49 _VALID_URL = r'.*'
50 IE_NAME = 'generic'
51 _TESTS = [
52 # Direct link to a video
53 {
54 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
55 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
56 'info_dict': {
57 'id': 'trailer',
58 'ext': 'mp4',
59 'title': 'trailer',
60 'upload_date': '20100513',
61 }
62 },
63 # Direct link to media delivered compressed (until Accept-Encoding is *)
64 {
65 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
66 'md5': '128c42e68b13950268b648275386fc74',
67 'info_dict': {
68 'id': 'FictionJunction-Parallel_Hearts',
69 'ext': 'flac',
70 'title': 'FictionJunction-Parallel_Hearts',
71 'upload_date': '20140522',
72 },
73 'expected_warnings': [
74 'URL could be a direct video link, returning it as such.'
75 ]
76 },
77 # Direct download with broken HEAD
78 {
79 'url': 'http://ai-radio.org:8000/radio.opus',
80 'info_dict': {
81 'id': 'radio',
82 'ext': 'opus',
83 'title': 'radio',
84 },
85 'params': {
86 'skip_download': True, # infinite live stream
87 },
88 'expected_warnings': [
89 r'501.*Not Implemented'
90 ],
91 },
92 # Direct link with incorrect MIME type
93 {
94 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
95 'md5': '4ccbebe5f36706d85221f204d7eb5913',
96 'info_dict': {
97 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
98 'id': '5_Lennart_Poettering_-_Systemd',
99 'ext': 'webm',
100 'title': '5_Lennart_Poettering_-_Systemd',
101 'upload_date': '20141120',
102 },
103 'expected_warnings': [
104 'URL could be a direct video link, returning it as such.'
105 ]
106 },
107 # RSS feed
108 {
109 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
110 'info_dict': {
111 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
112 'title': 'Zero Punctuation',
113 'description': 're:.*groundbreaking video review series.*'
114 },
115 'playlist_mincount': 11,
116 },
117 # RSS feed with enclosure
118 {
119 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
120 'info_dict': {
121 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
122 'ext': 'm4v',
123 'upload_date': '20150228',
124 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
125 }
126 },
127 # google redirect
128 {
129 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
130 'info_dict': {
131 'id': 'cmQHVoWB5FY',
132 'ext': 'mp4',
133 'upload_date': '20130224',
134 'uploader_id': 'TheVerge',
135 'description': 're:^Chris Ziegler takes a look at the\.*',
136 'uploader': 'The Verge',
137 'title': 'First Firefox OS phones side-by-side',
138 },
139 'params': {
140 'skip_download': False,
141 }
142 },
143 {
144 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
145 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
146 'info_dict': {
147 'id': '13601338388002',
148 'ext': 'mp4',
149 'uploader': 'www.hodiho.fr',
150 'title': 'R\u00e9gis plante sa Jeep',
151 }
152 },
153 # bandcamp page with custom domain
154 {
155 'add_ie': ['Bandcamp'],
156 'url': 'http://bronyrock.com/track/the-pony-mash',
157 'info_dict': {
158 'id': '3235767654',
159 'ext': 'mp3',
160 'title': 'The Pony Mash',
161 'uploader': 'M_Pallante',
162 },
163 'skip': 'There is a limit of 200 free downloads / month for the test song',
164 },
165 # embedded brightcove video
166 # it also tests brightcove videos that need to set the 'Referer' in the
167 # http requests
168 {
169 'add_ie': ['Brightcove'],
170 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
171 'info_dict': {
172 'id': '2765128793001',
173 'ext': 'mp4',
174 'title': 'Le cours de bourse : l’analyse technique',
175 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
176 'uploader': 'BFM BUSINESS',
177 },
178 'params': {
179 'skip_download': True,
180 },
181 },
182 {
183 # https://github.com/rg3/youtube-dl/issues/2253
184 'url': 'http://bcove.me/i6nfkrc3',
185 'md5': '0ba9446db037002366bab3b3eb30c88c',
186 'info_dict': {
187 'id': '3101154703001',
188 'ext': 'mp4',
189 'title': 'Still no power',
190 'uploader': 'thestar.com',
191 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
192 },
193 'add_ie': ['Brightcove'],
194 },
195 {
196 'url': 'http://www.championat.com/video/football/v/87/87499.html',
197 'md5': 'fb973ecf6e4a78a67453647444222983',
198 'info_dict': {
199 'id': '3414141473001',
200 'ext': 'mp4',
201 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
202 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
203 'uploader': 'Championat',
204 },
205 },
206 {
207 # https://github.com/rg3/youtube-dl/issues/3541
208 'add_ie': ['Brightcove'],
209 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
210 'info_dict': {
211 'id': '3866516442001',
212 'ext': 'mp4',
213 'title': 'Leer mij vrouwen kennen: Aflevering 1',
214 'description': 'Leer mij vrouwen kennen: Aflevering 1',
215 'uploader': 'SBS Broadcasting',
216 },
217 'skip': 'Restricted to Netherlands',
218 'params': {
219 'skip_download': True, # m3u8 download
220 },
221 },
222 # ooyala video
223 {
224 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
225 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
226 'info_dict': {
227 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
228 'ext': 'mp4',
229 'title': '2cc213299525360.mov', # that's what we get
230 },
231 'add_ie': ['Ooyala'],
232 },
233 # multiple ooyala embeds on SBN network websites
234 {
235 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
236 'info_dict': {
237 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
238 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
239 },
240 'playlist_mincount': 3,
241 'params': {
242 'skip_download': True,
243 },
244 'add_ie': ['Ooyala'],
245 },
246 # embed.ly video
247 {
248 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
249 'info_dict': {
250 'id': '9ODmcdjQcHQ',
251 'ext': 'mp4',
252 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
253 'upload_date': '20140225',
254 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
255 'uploader': 'Tested',
256 'uploader_id': 'testedcom',
257 },
258 # No need to test YoutubeIE here
259 'params': {
260 'skip_download': True,
261 },
262 },
263 # funnyordie embed
264 {
265 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
266 'info_dict': {
267 'id': '18e820ec3f',
268 'ext': 'mp4',
269 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
270 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
271 },
272 },
273 # BBC iPlayer embeds
274 {
275 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
276 'info_dict': {
277 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
278 },
279 'playlist_mincount': 18,
280 },
281 # RUTV embed
282 {
283 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
284 'info_dict': {
285 'id': '776940',
286 'ext': 'mp4',
287 'title': 'Охотское море стало целиком российским',
288 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
289 },
290 'params': {
291 # m3u8 download
292 'skip_download': True,
293 },
294 },
295 # TVC embed
296 {
297 'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/',
298 'info_dict': {
299 'id': '55304',
300 'ext': 'mp4',
301 'title': 'Дошкольное воспитание',
302 },
303 },
304 # SportBox embed
305 {
306 'url': 'http://www.vestifinance.ru/articles/25753',
307 'info_dict': {
308 'id': '25753',
309 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"',
310 },
311 'playlist': [{
312 'info_dict': {
313 'id': '370908',
314 'title': 'Госзаказ. День 3',
315 'ext': 'mp4',
316 }
317 }, {
318 'info_dict': {
319 'id': '370905',
320 'title': 'Госзаказ. День 2',
321 'ext': 'mp4',
322 }
323 }, {
324 'info_dict': {
325 'id': '370902',
326 'title': 'Госзаказ. День 1',
327 'ext': 'mp4',
328 }
329 }],
330 'params': {
331 # m3u8 download
332 'skip_download': True,
333 },
334 },
335 # Embedded TED video
336 {
337 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
338 'md5': '65fdff94098e4a607385a60c5177c638',
339 'info_dict': {
340 'id': '1969',
341 'ext': 'mp4',
342 'title': 'Hidden miracles of the natural world',
343 'uploader': 'Louie Schwartzberg',
344 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
345 }
346 },
347 # Embedded Ustream video
348 {
349 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
350 'md5': '27b99cdb639c9b12a79bca876a073417',
351 'info_dict': {
352 'id': '45734260',
353 'ext': 'flv',
354 'uploader': 'AU SPA: The NSA and Privacy',
355 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
356 }
357 },
358 # nowvideo embed hidden behind percent encoding
359 {
360 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
361 'md5': '2baf4ddd70f697d94b1c18cf796d5107',
362 'info_dict': {
363 'id': '06e53103ca9aa',
364 'ext': 'flv',
365 'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
366 'description': 'No description',
367 },
368 },
369 # arte embed
370 {
371 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
372 'md5': '7653032cbb25bf6c80d80f217055fa43',
373 'info_dict': {
374 'id': '048195-004_PLUS7-F',
375 'ext': 'flv',
376 'title': 'X:enius',
377 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
378 'upload_date': '20140320',
379 },
380 'params': {
381 'skip_download': 'Requires rtmpdump'
382 }
383 },
384 # Condé Nast embed
385 {
386 'url': 'http://www.wired.com/2014/04/honda-asimo/',
387 'md5': 'ba0dfe966fa007657bd1443ee672db0f',
388 'info_dict': {
389 'id': '53501be369702d3275860000',
390 'ext': 'mp4',
391 'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
392 }
393 },
394 # Dailymotion embed
395 {
396 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
397 'md5': '441aeeb82eb72c422c7f14ec533999cd',
398 'info_dict': {
399 'id': 'k2mm4bCdJ6CQ2i7c8o2',
400 'ext': 'mp4',
401 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
402 'uploader': 'Spi0n',
403 },
404 'add_ie': ['Dailymotion'],
405 },
406 # YouTube embed
407 {
408 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
409 'info_dict': {
410 'id': 'FXRb4ykk4S0',
411 'ext': 'mp4',
412 'title': 'The NBL Auction 2014',
413 'uploader': 'BADMINTON England',
414 'uploader_id': 'BADMINTONEvents',
415 'upload_date': '20140603',
416 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
417 },
418 'add_ie': ['Youtube'],
419 'params': {
420 'skip_download': True,
421 }
422 },
423 # MTVServices embed
424 {
425 'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
426 'md5': '35727f82f58c76d996fc188f9755b0d5',
427 'info_dict': {
428 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
429 'ext': 'mp4',
430 'title': 'Review',
431 'description': 'Mario\'s life in the fast lane has never looked so good.',
432 },
433 },
434 # YouTube embed via <data-embed-url="">
435 {
436 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
437 'info_dict': {
438 'id': '4vAffPZIT44',
439 'ext': 'mp4',
440 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
441 'uploader': 'Gameloft',
442 'uploader_id': 'gameloft',
443 'upload_date': '20140828',
444 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
445 },
446 'params': {
447 'skip_download': True,
448 }
449 },
450 # Camtasia studio
451 {
452 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
453 'playlist': [{
454 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
455 'info_dict': {
456 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
457 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
458 'ext': 'flv',
459 'duration': 2235.90,
460 }
461 }, {
462 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
463 'info_dict': {
464 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
465 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
466 'ext': 'flv',
467 'duration': 2235.93,
468 }
469 }],
470 'info_dict': {
471 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
472 }
473 },
474 # Flowplayer
475 {
476 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
477 'md5': '9d65602bf31c6e20014319c7d07fba27',
478 'info_dict': {
479 'id': '5123ea6d5e5a7',
480 'ext': 'mp4',
481 'age_limit': 18,
482 'uploader': 'www.handjobhub.com',
483 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
484 }
485 },
486 # Multiple brightcove videos
487 # https://github.com/rg3/youtube-dl/issues/2283
488 {
489 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
490 'info_dict': {
491 'id': 'always-never',
492 'title': 'Always / Never - The New Yorker',
493 },
494 'playlist_count': 3,
495 'params': {
496 'extract_flat': False,
497 'skip_download': True,
498 }
499 },
500 # MLB embed
501 {
502 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
503 'md5': '96f09a37e44da40dd083e12d9a683327',
504 'info_dict': {
505 'id': '33322633',
506 'ext': 'mp4',
507 'title': 'Ump changes call to ball',
508 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
509 'duration': 48,
510 'timestamp': 1401537900,
511 'upload_date': '20140531',
512 'thumbnail': 're:^https?://.*\.jpg$',
513 },
514 },
515 # Wistia embed
516 {
517 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
518 'md5': '8788b683c777a5cf25621eaf286d0c23',
519 'info_dict': {
520 'id': '1cfaf6b7ea',
521 'ext': 'mov',
522 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
523 'duration': 643.0,
524 'filesize': 182808282,
525 'uploader': 'education-portal.com',
526 },
527 },
528 {
529 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
530 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
531 'info_dict': {
532 'id': 'uxjb0lwrcz',
533 'ext': 'mp4',
534 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
535 'duration': 1715.0,
536 'uploader': 'thoughtworks.wistia.com',
537 },
538 },
539 # Soundcloud embed
540 {
541 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
542 'info_dict': {
543 'id': '174391317',
544 'ext': 'mp3',
545 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
546 'uploader': 'Sophos Security',
547 'title': 'Chet Chat 171 - Oct 29, 2014',
548 'upload_date': '20141029',
549 }
550 },
551 # Livestream embed
552 {
553 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
554 'info_dict': {
555 'id': '67864563',
556 'ext': 'flv',
557 'upload_date': '20141112',
558 'title': 'Rosetta #CometLanding webcast HL 10',
559 }
560 },
561 # LazyYT
562 {
563 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
564 'info_dict': {
565 'id': '1986',
566 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
567 },
568 'playlist_mincount': 2,
569 },
570 # Cinchcast embed
571 {
572 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
573 'info_dict': {
574 'id': '7141703',
575 'ext': 'mp3',
576 'upload_date': '20141126',
577 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
578 }
579 },
580 # Cinerama player
581 {
582 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
583 'info_dict': {
584 'id': '730m_DandD_1901_512k',
585 'ext': 'mp4',
586 'uploader': 'www.abc.net.au',
587 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
588 }
589 },
590 # embedded viddler video
591 {
592 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
593 'info_dict': {
594 'id': '4d03aad9',
595 'ext': 'mp4',
596 'uploader': 'deadspin',
597 'title': 'WALL-TO-GORTAT',
598 'timestamp': 1422285291,
599 'upload_date': '20150126',
600 },
601 'add_ie': ['Viddler'],
602 },
603 # Libsyn embed
604 {
605 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
606 'info_dict': {
607 'id': '3377616',
608 'ext': 'mp3',
609 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
610 'description': 'md5:601cb790edd05908957dae8aaa866465',
611 'upload_date': '20150220',
612 },
613 },
614 # jwplayer YouTube
615 {
616 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
617 'info_dict': {
618 'id': 'Mrj4DVp2zeA',
619 'ext': 'mp4',
620 'upload_date': '20150212',
621 'uploader': 'The National Archives UK',
622 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
623 'uploader_id': 'NationalArchives08',
624 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
625 },
626 },
627 # rtl.nl embed
628 {
629 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
630 'playlist_mincount': 5,
631 'info_dict': {
632 'id': 'aanslagen-kopenhagen',
633 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
634 }
635 },
636 # Zapiks embed
637 {
638 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
639 'info_dict': {
640 'id': '118046',
641 'ext': 'mp4',
642 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
643 }
644 },
645 # Kaltura embed
646 {
647 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
648 'info_dict': {
649 'id': '1_eergr3h1',
650 'ext': 'mp4',
651 'upload_date': '20150226',
652 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
653 'timestamp': int,
654 'title': 'John Carlson Postgame 2/25/15',
655 },
656 },
657 # Eagle.Platform embed (generic URL)
658 {
659 'url': 'http://lenta.ru/news/2015/03/06/navalny/',
660 'info_dict': {
661 'id': '227304',
662 'ext': 'mp4',
663 'title': 'Навальный вышел на свободу',
664 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
665 'thumbnail': 're:^https?://.*\.jpg$',
666 'duration': 87,
667 'view_count': int,
668 'age_limit': 0,
669 },
670 },
671 # ClipYou (Eagle.Platform) embed (custom URL)
672 {
673 'url': 'http://muz-tv.ru/play/7129/',
674 'info_dict': {
675 'id': '12820',
676 'ext': 'mp4',
677 'title': "'O Sole Mio",
678 'thumbnail': 're:^https?://.*\.jpg$',
679 'duration': 216,
680 'view_count': int,
681 },
682 },
683 # Pladform embed
684 {
685 'url': 'http://muz-tv.ru/kinozal/view/7400/',
686 'info_dict': {
687 'id': '100183293',
688 'ext': 'mp4',
689 'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
690 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
691 'thumbnail': 're:^https?://.*\.jpg$',
692 'duration': 694,
693 'age_limit': 0,
694 },
695 },
696 # Playwire embed
697 {
698 'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
699 'info_dict': {
700 'id': '3519514',
701 'ext': 'mp4',
702 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
703 'thumbnail': 're:^https?://.*\.png$',
704 'duration': 45.115,
705 },
706 },
707 # 5min embed
708 {
709 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
710 'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
711 'info_dict': {
712 'id': '518726732',
713 'ext': 'mp4',
714 'title': 'Facebook Creates "On This Day" | Crunch Report',
715 },
716 },
717 # SVT embed
718 {
719 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
720 'info_dict': {
721 'id': '2900353',
722 'ext': 'flv',
723 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
724 'duration': 27,
725 'age_limit': 0,
726 },
727 },
728 # Crooks and Liars embed
729 {
730 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
731 'info_dict': {
732 'id': '8RUoRhRi',
733 'ext': 'mp4',
734 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
735 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
736 'timestamp': 1428207000,
737 'upload_date': '20150405',
738 'uploader': 'Heather',
739 },
740 },
741 # Crooks and Liars external embed
742 {
743 'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
744 'info_dict': {
745 'id': 'MTE3MjUtMzQ2MzA',
746 'ext': 'mp4',
747 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
748 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
749 'timestamp': 1265032391,
750 'upload_date': '20100201',
751 'uploader': 'Heather',
752 },
753 },
754 # NBC Sports vplayer embed
755 {
756 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
757 'info_dict': {
758 'id': 'ln7x1qSThw4k',
759 'ext': 'flv',
760 'title': "PFT Live: New leader in the 'new-look' defense",
761 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
762 },
763 },
764 # UDN embed
765 {
766 'url': 'http://www.udn.com/news/story/7314/822787',
767 'md5': 'fd2060e988c326991037b9aff9df21a6',
768 'info_dict': {
769 'id': '300346',
770 'ext': 'mp4',
771 'title': '中一中男師變性 全校師生力挺',
772 'thumbnail': 're:^https?://.*\.jpg$',
773 }
774 },
775 # Ooyala embed
776 {
777 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
778 'info_dict': {
779 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
780 'ext': 'mp4',
781 'description': 'VIDEO: Index/Match versus VLOOKUP.',
782 'title': 'This is what separates the Excel masters from the wannabes',
783 },
784 'params': {
785 # m3u8 downloads
786 'skip_download': True,
787 }
788 },
789 # Contains a SMIL manifest
790 {
791 'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
792 'info_dict': {
793 'id': 'file',
794 'ext': 'flv',
795 'title': '+ Football: Lottery Champions League Europe',
796 'uploader': 'www.telewebion.com',
797 },
798 'params': {
799 # rtmpe downloads
800 'skip_download': True,
801 }
802 },
803 # Brightcove URL in single quotes
804 {
805 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/',
806 'md5': '4ae374f1f8b91c889c4b9203c8c752af',
807 'info_dict': {
808 'id': '4255764656001',
809 'ext': 'mp4',
810 'title': 'SN Presents: Russell Martin, World Citizen',
811 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
812 'uploader': 'Rogers Sportsnet',
813 },
814 }
815 ]
816
def report_following_redirect(self, new_url):
    """Tell the user, via the downloader's screen output, which URL a redirect leads to."""
    message = '[redirect] Following redirect to %s' % new_url
    self._downloader.to_screen(message)
820
821 def _extract_rss(self, url, video_id, doc):
822 playlist_title = doc.find('./channel/title').text
823 playlist_desc_el = doc.find('./channel/description')
824 playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
825
826 entries = []
827 for it in doc.findall('./channel/item'):
828 next_url = xpath_text(it, 'link', fatal=False)
829 if not next_url:
830 enclosure_nodes = it.findall('./enclosure')
831 for e in enclosure_nodes:
832 next_url = e.attrib.get('url')
833 if next_url:
834 break
835
836 if not next_url:
837 continue
838
839 entries.append({
840 '_type': 'url',
841 'url': next_url,
842 'title': it.find('title').text,
843 })
844
845 return {
846 '_type': 'playlist',
847 'id': url,
848 'title': playlist_title,
849 'description': playlist_desc,
850 'entries': entries,
851 }
852
853 def _extract_camtasia(self, url, video_id, webpage):
854 """ Returns None if no camtasia video can be found. """
855
856 camtasia_cfg = self._search_regex(
857 r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
858 webpage, 'camtasia configuration file', default=None)
859 if camtasia_cfg is None:
860 return None
861
862 title = self._html_search_meta('DC.title', webpage, fatal=True)
863
864 camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
865 camtasia_cfg = self._download_xml(
866 camtasia_url, video_id,
867 note='Downloading camtasia configuration',
868 errnote='Failed to download camtasia configuration')
869 fileset_node = camtasia_cfg.find('./playlist/array/fileset')
870
871 entries = []
872 for n in fileset_node.getchildren():
873 url_n = n.find('./uri')
874 if url_n is None:
875 continue
876
877 entries.append({
878 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
879 'title': '%s - %s' % (title, n.tag),
880 'url': compat_urlparse.urljoin(url, url_n.text),
881 'duration': float_or_none(n.find('./duration').text),
882 })
883
884 return {
885 '_type': 'playlist',
886 'entries': entries,
887 'title': title,
888 }
889
890 def _real_extract(self, url):
891 if url.startswith('//'):
892 return {
893 '_type': 'url',
894 'url': self.http_scheme() + url,
895 }
896
897 parsed_url = compat_urlparse.urlparse(url)
898 if not parsed_url.scheme:
899 default_search = self._downloader.params.get('default_search')
900 if default_search is None:
901 default_search = 'fixup_error'
902
903 if default_search in ('auto', 'auto_warning', 'fixup_error'):
904 if '/' in url:
905 self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
906 return self.url_result('http://' + url)
907 elif default_search != 'fixup_error':
908 if default_search == 'auto_warning':
909 if re.match(r'^(?:url|URL)$', url):
910 raise ExtractorError(
911 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
912 expected=True)
913 else:
914 self._downloader.report_warning(
915 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
916 return self.url_result('ytsearch:' + url)
917
918 if default_search in ('error', 'fixup_error'):
919 raise ExtractorError(
920 '%r is not a valid URL. '
921 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
922 % (url, url), expected=True)
923 else:
924 if ':' not in default_search:
925 default_search += ':'
926 return self.url_result(default_search + url)
927
928 url, smuggled_data = unsmuggle_url(url)
929 force_videoid = None
930 is_intentional = smuggled_data and smuggled_data.get('to_generic')
931 if smuggled_data and 'force_videoid' in smuggled_data:
932 force_videoid = smuggled_data['force_videoid']
933 video_id = force_videoid
934 else:
935 video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
936
937 self.to_screen('%s: Requesting header' % video_id)
938
939 head_req = HEADRequest(url)
940 head_response = self._request_webpage(
941 head_req, video_id,
942 note=False, errnote='Could not send HEAD request to %s' % url,
943 fatal=False)
944
945 if head_response is not False:
946 # Check for redirect
947 new_url = head_response.geturl()
948 if url != new_url:
949 self.report_following_redirect(new_url)
950 if force_videoid:
951 new_url = smuggle_url(
952 new_url, {'force_videoid': force_videoid})
953 return self.url_result(new_url)
954
955 full_response = None
956 if head_response is False:
957 request = compat_urllib_request.Request(url)
958 request.add_header('Accept-Encoding', '*')
959 full_response = self._request_webpage(request, video_id)
960 head_response = full_response
961
962 # Check for direct link to a video
963 content_type = head_response.headers.get('Content-Type', '')
964 m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
965 if m:
966 upload_date = unified_strdate(
967 head_response.headers.get('Last-Modified'))
968 return {
969 'id': video_id,
970 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
971 'direct': True,
972 'formats': [{
973 'format_id': m.group('format_id'),
974 'url': url,
975 'vcodec': 'none' if m.group('type') == 'audio' else None
976 }],
977 'upload_date': upload_date,
978 }
979
980 if not self._downloader.params.get('test', False) and not is_intentional:
981 self._downloader.report_warning('Falling back on generic information extractor.')
982
983 if not full_response:
984 request = compat_urllib_request.Request(url)
985 # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
986 # making it impossible to download only chunk of the file (yet we need only 512kB to
987 # test whether it's HTML or not). According to youtube-dl default Accept-Encoding
988 # that will always result in downloading the whole file that is not desirable.
989 # Therefore for extraction pass we have to override Accept-Encoding to any in order
990 # to accept raw bytes and being able to download only a chunk.
991 # It would probably be better to solve this by checking Content-Type for application/octet-stream
992 # after HEAD request finishes, but not sure if we can rely on this.
993 request.add_header('Accept-Encoding', '*')
994 full_response = self._request_webpage(request, video_id)
995
996 # Maybe it's a direct link to a video?
997 # Be careful not to download the whole thing!
998 first_bytes = full_response.read(512)
999 if not is_html(first_bytes):
1000 self._downloader.report_warning(
1001 'URL could be a direct video link, returning it as such.')
1002 upload_date = unified_strdate(
1003 head_response.headers.get('Last-Modified'))
1004 return {
1005 'id': video_id,
1006 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
1007 'direct': True,
1008 'url': url,
1009 'upload_date': upload_date,
1010 }
1011
1012 webpage = self._webpage_read_content(
1013 full_response, url, video_id, prefix=first_bytes)
1014
1015 self.report_extraction(video_id)
1016
1017 # Is it an RSS feed?
1018 try:
1019 doc = parse_xml(webpage)
1020 if doc.tag == 'rss':
1021 return self._extract_rss(url, video_id, doc)
1022 except compat_xml_parse_error:
1023 pass
1024
1025 # Is it a Camtasia project?
1026 camtasia_res = self._extract_camtasia(url, video_id, webpage)
1027 if camtasia_res is not None:
1028 return camtasia_res
1029
1030 # Sometimes embedded video player is hidden behind percent encoding
1031 # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
1032 # Unescaping the whole page allows handling those cases in a generic way
1033 webpage = compat_urllib_parse.unquote(webpage)
1034
1035 # it's tempting to parse this further, but you would
1036 # have to take into account all the variations like
1037 # Video Title - Site Name
1038 # Site Name | Video Title
1039 # Video Title - Tagline | Site Name
1040 # and so on and so forth; it's just not practical
1041 video_title = self._html_search_regex(
1042 r'(?s)<title>(.*?)</title>', webpage, 'video title',
1043 default='video')
1044
1045 # Try to detect age limit automatically
1046 age_limit = self._rta_search(webpage)
1047 # And then there are the jokers who advertise that they use RTA,
1048 # but actually don't.
1049 AGE_LIMIT_MARKERS = [
1050 r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
1051 ]
1052 if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
1053 age_limit = 18
1054
1055 # video uploader is domain name
1056 video_uploader = self._search_regex(
1057 r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
1058
1059 # Helper method
def _playlist_from_matches(matches, getter=None, ie=None):
    """Wrap every match in a URL result and return them as one playlist.

    matches -- iterable of raw matches (URLs, or objects `getter` can turn
               into URLs)
    getter  -- optional callable applied to each match to obtain its URL;
               when falsy the match itself is used
    ie      -- optional extractor key forwarded to self.url_result

    The playlist id and title come from the enclosing scope's
    `video_id` and `video_title`.
    """
    def _to_url_result(match):
        target = getter(match) if getter else match
        # Protocol-relative URLs (//host/path) are passed through
        # self._proto_relative_url before being wrapped.
        return self.url_result(self._proto_relative_url(target), ie)

    # orderedSet drops duplicate results.
    unique_results = orderedSet(_to_url_result(match) for match in matches)
    return self.playlist_result(
        unique_results, playlist_id=video_id, playlist_title=video_title)
1066
1067 # Look for BrightCove:
1068 bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
1069 if bc_urls:
1070 self.to_screen('Brightcove video detected.')
1071 entries = [{
1072 '_type': 'url',
1073 'url': smuggle_url(bc_url, {'Referer': url}),
1074 'ie_key': 'Brightcove'
1075 } for bc_url in bc_urls]
1076
1077 return {
1078 '_type': 'playlist',
1079 'title': video_title,
1080 'id': video_id,
1081 'entries': entries,
1082 }
1083
1084 # Look for embedded rtl.nl player
1085 matches = re.findall(
1086 r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
1087 webpage)
1088 if matches:
1089 return _playlist_from_matches(matches, ie='RtlNl')
1090
1091 # Look for embedded (iframe) Vimeo player
1092 mobj = re.search(
1093 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
1094 if mobj:
1095 player_url = unescapeHTML(mobj.group('url'))
1096 surl = smuggle_url(player_url, {'Referer': url})
1097 return self.url_result(surl)
1098 # Look for embedded (swf embed) Vimeo player
1099 mobj = re.search(
1100 r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
1101 if mobj:
1102 return self.url_result(mobj.group(1))
1103
1104 # Look for embedded YouTube player
1105 matches = re.findall(r'''(?x)
1106 (?:
1107 <iframe[^>]+?src=|
1108 data-video-url=|
1109 <embed[^>]+?src=|
1110 embedSWF\(?:\s*|
1111 new\s+SWFObject\(
1112 )
1113 (["\'])
1114 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1115 (?:embed|v|p)/.+?)
1116 \1''', webpage)
1117 if matches:
1118 return _playlist_from_matches(
1119 matches, lambda m: unescapeHTML(m[1]))
1120
1121 # Look for lazyYT YouTube embed
1122 matches = re.findall(
1123 r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
1124 if matches:
1125 return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
1126
1127 # Look for embedded Dailymotion player
1128 matches = re.findall(
1129 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
1130 if matches:
1131 return _playlist_from_matches(
1132 matches, lambda m: unescapeHTML(m[1]))
1133
1134 # Look for embedded Dailymotion playlist player (#3822)
1135 m = re.search(
1136 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
1137 if m:
1138 playlists = re.findall(
1139 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
1140 if playlists:
1141 return _playlist_from_matches(
1142 playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
1143
1144 # Look for embedded Wistia player
1145 match = re.search(
1146 r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
1147 if match:
1148 embed_url = self._proto_relative_url(
1149 unescapeHTML(match.group('url')))
1150 return {
1151 '_type': 'url_transparent',
1152 'url': embed_url,
1153 'ie_key': 'Wistia',
1154 'uploader': video_uploader,
1155 'title': video_title,
1156 'id': video_id,
1157 }
1158
1159 match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
1160 if match:
1161 return {
1162 '_type': 'url_transparent',
1163 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
1164 'ie_key': 'Wistia',
1165 'uploader': video_uploader,
1166 'title': video_title,
1167 'id': match.group('id')
1168 }
1169
1170 # Look for embedded blip.tv player
1171 bliptv_url = BlipTVIE._extract_url(webpage)
1172 if bliptv_url:
1173 return self.url_result(bliptv_url, 'BlipTV')
1174
1175 # Look for SVT player
1176 svt_url = SVTIE._extract_url(webpage)
1177 if svt_url:
1178 return self.url_result(svt_url, 'SVT')
1179
1180 # Look for embedded condenast player
1181 matches = re.findall(
1182 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1183 webpage)
1184 if matches:
1185 return {
1186 '_type': 'playlist',
1187 'entries': [{
1188 '_type': 'url',
1189 'ie_key': 'CondeNast',
1190 'url': ma,
1191 } for ma in matches],
1192 'title': video_title,
1193 'id': video_id,
1194 }
1195
1196 # Look for Bandcamp pages with custom domain
1197 mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1198 if mobj is not None:
1199 burl = unescapeHTML(mobj.group(1))
1200 # Don't set the extractor because it can be a track url or an album
1201 return self.url_result(burl)
1202
1203 # Look for embedded Vevo player
1204 mobj = re.search(
1205 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1206 if mobj is not None:
1207 return self.url_result(mobj.group('url'))
1208
1209 # Look for embedded Viddler player
1210 mobj = re.search(
1211 r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1212 webpage)
1213 if mobj is not None:
1214 return self.url_result(mobj.group('url'))
1215
1216 # Look for NYTimes player
1217 mobj = re.search(
1218 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1219 webpage)
1220 if mobj is not None:
1221 return self.url_result(mobj.group('url'))
1222
1223 # Look for Libsyn player
1224 mobj = re.search(
1225 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1226 if mobj is not None:
1227 return self.url_result(mobj.group('url'))
1228
1229 # Look for Ooyala videos
1230 mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
1231 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1232 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
1233 re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
1234 if mobj is not None:
1235 return OoyalaIE._build_url_result(mobj.group('ec'))
1236
1237 # Look for multiple Ooyala embeds on SBN network websites
1238 mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1239 if mobj is not None:
1240 embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1241 if embeds:
1242 return _playlist_from_matches(
1243 embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1244
1245 # Look for Aparat videos
1246 mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
1247 if mobj is not None:
1248 return self.url_result(mobj.group(1), 'Aparat')
1249
1250 # Look for MPORA videos
1251 mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
1252 if mobj is not None:
1253 return self.url_result(mobj.group(1), 'Mpora')
1254
1255 # Look for embedded NovaMov-based player
1256 mobj = re.search(
1257 r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
1258 (?P<url>http://(?:(?:embed|www)\.)?
1259 (?:novamov\.com|
1260 nowvideo\.(?:ch|sx|eu|at|ag|co)|
1261 videoweed\.(?:es|com)|
1262 movshare\.(?:net|sx|ag)|
1263 divxstage\.(?:eu|net|ch|co|at|ag))
1264 /embed\.php.+?)\1''', webpage)
1265 if mobj is not None:
1266 return self.url_result(mobj.group('url'))
1267
1268 # Look for embedded Facebook player
1269 mobj = re.search(
1270 r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
1271 if mobj is not None:
1272 return self.url_result(mobj.group('url'), 'Facebook')
1273
1274 # Look for embedded VK player
1275 mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1276 if mobj is not None:
1277 return self.url_result(mobj.group('url'), 'VK')
1278
1279 # Look for embedded ivi player
1280 mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1281 if mobj is not None:
1282 return self.url_result(mobj.group('url'), 'Ivi')
1283
1284 # Look for embedded Huffington Post player
1285 mobj = re.search(
1286 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
1287 if mobj is not None:
1288 return self.url_result(mobj.group('url'), 'HuffPost')
1289
1290 # Look for embed.ly
1291 mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1292 if mobj is not None:
1293 return self.url_result(mobj.group('url'))
1294 mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1295 if mobj is not None:
1296 return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1297
1298 # Look for funnyordie embed
1299 matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1300 if matches:
1301 return _playlist_from_matches(
1302 matches, getter=unescapeHTML, ie='FunnyOrDie')
1303
1304 # Look for BBC iPlayer embed
1305 matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1306 if matches:
1307 return _playlist_from_matches(matches, ie='BBCCoUk')
1308
1309 # Look for embedded RUTV player
1310 rutv_url = RUTVIE._extract_url(webpage)
1311 if rutv_url:
1312 return self.url_result(rutv_url, 'RUTV')
1313
1314 # Look for embedded TVC player
1315 rutv_url = TVCIE._extract_url(webpage)
1316 if rutv_url:
1317 return self.url_result(rutv_url, 'TVC')
1318
1319 # Look for embedded SportBox player
1320 sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
1321 if sportbox_urls:
1322 return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
1323
1324 # Look for embedded TED player
1325 mobj = re.search(
1326 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
1327 if mobj is not None:
1328 return self.url_result(mobj.group('url'), 'TED')
1329
1330 # Look for embedded Ustream videos
1331 mobj = re.search(
1332 r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1333 if mobj is not None:
1334 return self.url_result(mobj.group('url'), 'Ustream')
1335
1336 # Look for embedded arte.tv player
1337 mobj = re.search(
1338 r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1339 webpage)
1340 if mobj is not None:
1341 return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1342
1343 # Look for embedded smotri.com player
1344 smotri_url = SmotriIE._extract_url(webpage)
1345 if smotri_url:
1346 return self.url_result(smotri_url, 'Smotri')
1347
1348 # Look for embedded soundcloud player
1349 mobj = re.search(
1350 r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
1351 webpage)
1352 if mobj is not None:
1353 url = unescapeHTML(mobj.group('url'))
1354 return self.url_result(url)
1355
1356 # Look for embedded vulture.com player
1357 mobj = re.search(
1358 r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1359 webpage)
1360 if mobj is not None:
1361 url = unescapeHTML(mobj.group('url'))
1362 return self.url_result(url, ie='Vulture')
1363
1364 # Look for embedded mtvservices player
1365 mobj = re.search(
1366 r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1367 webpage)
1368 if mobj is not None:
1369 url = unescapeHTML(mobj.group('url'))
1370 return self.url_result(url, ie='MTVServicesEmbedded')
1371
1372 # Look for embedded yahoo player
1373 mobj = re.search(
1374 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1375 webpage)
1376 if mobj is not None:
1377 return self.url_result(mobj.group('url'), 'Yahoo')
1378
1379 # Look for embedded sbs.com.au player
1380 mobj = re.search(
1381 r'''(?x)
1382 (?:
1383 <meta\s+property="og:video"\s+content=|
1384 <iframe[^>]+?src=
1385 )
1386 (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
1387 webpage)
1388 if mobj is not None:
1389 return self.url_result(mobj.group('url'), 'SBS')
1390
1391 # Look for embedded Cinchcast player
1392 mobj = re.search(
1393 r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1394 webpage)
1395 if mobj is not None:
1396 return self.url_result(mobj.group('url'), 'Cinchcast')
1397
1398 mobj = re.search(
1399 r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1400 webpage)
1401 if not mobj:
1402 mobj = re.search(
1403 r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
1404 webpage)
1405 if mobj is not None:
1406 return self.url_result(mobj.group('url'), 'MLB')
1407
1408 mobj = re.search(
1409 r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1410 webpage)
1411 if mobj is not None:
1412 return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1413
1414 mobj = re.search(
1415 r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1416 webpage)
1417 if mobj is not None:
1418 return self.url_result(mobj.group('url'), 'Livestream')
1419
1420 # Look for Zapiks embed
1421 mobj = re.search(
1422 r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1423 if mobj is not None:
1424 return self.url_result(mobj.group('url'), 'Zapiks')
1425
1426 # Look for Kaltura embeds
1427 mobj = re.search(
1428 r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1429 if mobj is not None:
1430 return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1431
1432 # Look for Eagle.Platform embeds
1433 mobj = re.search(
1434 r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1435 if mobj is not None:
1436 return self.url_result(mobj.group('url'), 'EaglePlatform')
1437
1438 # Look for ClipYou (uses Eagle.Platform) embeds
1439 mobj = re.search(
1440 r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1441 if mobj is not None:
1442 return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1443
1444 # Look for Pladform embeds
1445 mobj = re.search(
1446 r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1447 if mobj is not None:
1448 return self.url_result(mobj.group('url'), 'Pladform')
1449
1450 # Look for Playwire embeds
1451 mobj = re.search(
1452 r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
1453 if mobj is not None:
1454 return self.url_result(mobj.group('url'))
1455
1456 # Look for 5min embeds
1457 mobj = re.search(
1458 r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1459 if mobj is not None:
1460 return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1461
1462 # Look for Crooks and Liars embeds
1463 mobj = re.search(
1464 r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
1465 if mobj is not None:
1466 return self.url_result(mobj.group('url'))
1467
1468 # Look for NBC Sports VPlayer embeds
1469 nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1470 if nbc_sports_url:
1471 return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1472
1473 # Look for UDN embeds
1474 mobj = re.search(
1475 r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
1476 if mobj is not None:
1477 return self.url_result(
1478 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
1479
1480 # Look for Senate ISVP iframe
1481 senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
1482 if senate_isvp_url:
1483 return self.url_result(senate_isvp_url, 'SenateISVP')
1484
def check_video(vurl):
    """Tell whether `vurl` looks like it could point at a video.

    Returns True when YoutubeIE.suitable accepts the URL. Otherwise the
    URL path must contain a dot and its extension (as reported by
    determine_ext) must not be one of the known non-video extensions
    (Flash players, images, subtitle files).
    """
    if not YoutubeIE.suitable(vurl):
        url_path = compat_urlparse.urlparse(vurl).path
        if '.' not in url_path:
            # No extension at all: not treated as a video
            return False
        non_video_exts = ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
        return determine_ext(url_path) not in non_video_exts
    return True
1491
def filter_video(urls):
    """Return a list of the entries of `urls` accepted by check_video, in order."""
    return [candidate for candidate in urls if check_video(candidate)]
1494
1495 # Start with something easy: JW Player in SWFObject
1496 found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
1497 if not found:
1498 # Look for gorilla-vid style embedding
1499 found = filter_video(re.findall(r'''(?sx)
1500 (?:
1501 jw_plugins|
1502 JWPlayerOptions|
1503 jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1504 )
1505 .*?
1506 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
1507 if not found:
1508 # Broaden the search a little bit
1509 found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
1510 if not found:
1511 # Broaden the findall a little bit: JWPlayer JS loader
1512 found = filter_video(re.findall(
1513 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
1514 if not found:
1515 # Flow player
1516 found = filter_video(re.findall(r'''(?xs)
1517 flowplayer\("[^"]+",\s*
1518 \{[^}]+?\}\s*,
1519 \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
1520 ["']?url["']?\s*:\s*["']([^"']+)["']
1521 ''', webpage))
1522 if not found:
1523 # Cinerama player
1524 found = re.findall(
1525 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
1526 if not found:
1527 # Try to find twitter cards info
1528 found = filter_video(re.findall(
1529 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
1530 if not found:
1531 # We look for Open Graph info:
1532 # We have to match any number of spaces between elements, some sites try to align them (e.g. statigr.am)
1533 m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1534 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1535 if m_video_type is not None:
1536 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
1537 if not found:
1538 # HTML5 video
1539 found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
1540 if not found:
1541 REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
1542 found = re.search(
1543 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
1544 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
1545 webpage)
1546 if not found:
1547 # Look also in Refresh HTTP header
1548 refresh_header = head_response.headers.get('Refresh')
1549 if refresh_header:
1550 found = re.search(REDIRECT_REGEX, refresh_header)
1551 if found:
1552 new_url = compat_urlparse.urljoin(url, found.group(1))
1553 self.report_following_redirect(new_url)
1554 return {
1555 '_type': 'url',
1556 'url': new_url,
1557 }
1558 if not found:
1559 raise UnsupportedError(url)
1560
1561 entries = []
1562 for video_url in found:
1563 video_url = compat_urlparse.urljoin(url, video_url)
1564 video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
1565
1566 # Sometimes, jwplayer extraction will result in a YouTube URL
1567 if YoutubeIE.suitable(video_url):
1568 entries.append(self.url_result(video_url, 'Youtube'))
1569 continue
1570
1571 # here's a fun little line of code for you:
1572 video_id = os.path.splitext(video_id)[0]
1573
1574 if determine_ext(video_url) == 'smil':
1575 entries.append({
1576 'id': video_id,
1577 'formats': self._extract_smil_formats(video_url, video_id),
1578 'uploader': video_uploader,
1579 'title': video_title,
1580 'age_limit': age_limit,
1581 })
1582 else:
1583 entries.append({
1584 'id': video_id,
1585 'url': video_url,
1586 'uploader': video_uploader,
1587 'title': video_title,
1588 'age_limit': age_limit,
1589 })
1590
1591 if len(entries) == 1:
1592 return entries[0]
1593 else:
1594 for num, e in enumerate(entries, start=1):
1595 # 'url' results don't have a title
1596 if e.get('title') is not None:
1597 e['title'] = '%s (%d)' % (e['title'], num)
1598 return {
1599 '_type': 'playlist',
1600 'entries': entries,
1601 }