]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/generic.py
[pbs] Add support for subtitles (Closes #6184)
[yt-dlp.git] / youtube_dl / extractor / generic.py
CommitLineData
cfe50f04
JMF
1# encoding: utf-8
2
79649588
PH
3from __future__ import unicode_literals
4
9b122384
PH
5import os
6import re
7
8from .common import InfoExtractor
fc9713a1 9from .youtube import YoutubeIE
8c25f81b 10from ..compat import (
9b122384 11 compat_urllib_parse,
1ddb9456
S
12 compat_urllib_parse_unquote,
13 compat_urllib_request,
a5caba1e 14 compat_urlparse,
f7300c5c 15 compat_xml_parse_error,
8c25f81b
PH
16)
17from ..utils import (
b759a0d4 18 determine_ext,
9b122384 19 ExtractorError,
c8e9a235 20 float_or_none,
aa94a6d3 21 HEADRequest,
61ca9a80 22 is_html,
ed2d6a19 23 orderedSet,
bcf89ce6 24 parse_xml,
9d4660ca
PH
25 smuggle_url,
26 unescapeHTML,
42393ce2 27 unified_strdate,
4d54ef20 28 unsmuggle_url,
416c7fcb 29 UnsupportedError,
42393ce2 30 url_basename,
76c73715 31 xpath_text,
9b122384 32)
cfe50f04 33from .brightcove import BrightcoveIE
a2edf2e7 34from .nbc import NBCSportsVPlayerIE
c0d0b01f 35from .ooyala import OoyalaIE
93d020dd 36from .rutv import RUTVIE
954c1d05 37from .tvc import TVCIE
d40a3b5b 38from .sportbox import SportBoxEmbedIE
cb3ac1c6 39from .smotri import SmotriIE
6dd94d3a 40from .myvi import MyviIE
1419fafd 41from .condenast import CondeNastIE
418c5cc3 42from .udn import UDNEmbedIE
2fe1b5bd 43from .senateisvp import SenateISVPIE
0954cd8a 44from .bliptv import BlipTVIE
bab19a8e 45from .svt import SVTIE
65d161c4 46from .pornhub import PornHubIE
2bb5b6d0 47from .xhamster import XHamsterEmbedIE
b407e173 48from .vimeo import VimeoIE
756f574e 49from .dailymotion import DailymotionCloudIE
1ac1c4c2 50from .onionstudios import OnionStudiosIE
eedd20ef 51from .snagfilms import SnagFilmsEmbedIE
9b122384 52
0838239e 53
9b122384 54class GenericIE(InfoExtractor):
79649588 55 IE_DESC = 'Generic downloader that works on some sites'
9b122384 56 _VALID_URL = r'.*'
79649588 57 IE_NAME = 'generic'
cfe50f04 58 _TESTS = [
c5fa81fe
S
59 # Direct link to a video
60 {
61 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
62 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
63 'info_dict': {
64 'id': 'trailer',
65 'ext': 'mp4',
66 'title': 'trailer',
67 'upload_date': '20100513',
68 }
69 },
c5138a7c 70 # Direct link to media delivered compressed (until Accept-Encoding is *)
c5fa81fe
S
71 {
72 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
73 'md5': '128c42e68b13950268b648275386fc74',
74 'info_dict': {
75 'id': 'FictionJunction-Parallel_Hearts',
76 'ext': 'flac',
77 'title': 'FictionJunction-Parallel_Hearts',
78 'upload_date': '20140522',
79 },
80 'expected_warnings': [
81 'URL could be a direct video link, returning it as such.'
82 ]
83 },
84 # Direct download with broken HEAD
85 {
86 'url': 'http://ai-radio.org:8000/radio.opus',
87 'info_dict': {
88 'id': 'radio',
89 'ext': 'opus',
90 'title': 'radio',
91 },
92 'params': {
93 'skip_download': True, # infinite live stream
94 },
95 'expected_warnings': [
96 r'501.*Not Implemented'
97 ],
98 },
99 # Direct link with incorrect MIME type
100 {
101 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
102 'md5': '4ccbebe5f36706d85221f204d7eb5913',
103 'info_dict': {
104 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
105 'id': '5_Lennart_Poettering_-_Systemd',
106 'ext': 'webm',
107 'title': '5_Lennart_Poettering_-_Systemd',
108 'upload_date': '20141120',
109 },
110 'expected_warnings': [
111 'URL could be a direct video link, returning it as such.'
112 ]
113 },
114 # RSS feed
115 {
116 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
117 'info_dict': {
118 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
119 'title': 'Zero Punctuation',
120 'description': 're:.*groundbreaking video review series.*'
121 },
122 'playlist_mincount': 11,
123 },
124 # RSS feed with enclosure
125 {
126 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
127 'info_dict': {
128 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
129 'ext': 'm4v',
130 'upload_date': '20150228',
131 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
132 }
133 },
134 # google redirect
135 {
136 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
137 'info_dict': {
138 'id': 'cmQHVoWB5FY',
139 'ext': 'mp4',
140 'upload_date': '20130224',
141 'uploader_id': 'TheVerge',
142 'description': 're:^Chris Ziegler takes a look at the\.*',
143 'uploader': 'The Verge',
144 'title': 'First Firefox OS phones side-by-side',
145 },
146 'params': {
147 'skip_download': False,
148 }
149 },
cfe50f04 150 {
79649588 151 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
d360a146 152 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
79649588 153 'info_dict': {
d360a146
S
154 'id': '13601338388002',
155 'ext': 'mp4',
79649588
PH
156 'uploader': 'www.hodiho.fr',
157 'title': 'R\u00e9gis plante sa Jeep',
cfe50f04
JMF
158 }
159 },
c19f7764
JMF
160 # bandcamp page with custom domain
161 {
79649588
PH
162 'add_ie': ['Bandcamp'],
163 'url': 'http://bronyrock.com/track/the-pony-mash',
79649588 164 'info_dict': {
fd50bf62
S
165 'id': '3235767654',
166 'ext': 'mp3',
79649588
PH
167 'title': 'The Pony Mash',
168 'uploader': 'M_Pallante',
c19f7764 169 },
79649588 170 'skip': 'There is a limit of 200 free downloads / month for the test song',
c19f7764 171 },
eeb165e6 172 # embedded brightcove video
dd5bcdc4
JMF
173 # it also tests brightcove videos that need to set the 'Referer' in the
174 # http requests
eeb165e6 175 {
79649588
PH
176 'add_ie': ['Brightcove'],
177 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
178 'info_dict': {
179 'id': '2765128793001',
180 'ext': 'mp4',
181 'title': 'Le cours de bourse : l’analyse technique',
182 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
183 'uploader': 'BFM BUSINESS',
eeb165e6 184 },
79649588
PH
185 'params': {
186 'skip_download': True,
eeb165e6
JMF
187 },
188 },
17ab4d3b
PH
189 {
190 # https://github.com/rg3/youtube-dl/issues/2253
191 'url': 'http://bcove.me/i6nfkrc3',
17ab4d3b
PH
192 'md5': '0ba9446db037002366bab3b3eb30c88c',
193 'info_dict': {
fd50bf62
S
194 'id': '3101154703001',
195 'ext': 'mp4',
17ab4d3b
PH
196 'title': 'Still no power',
197 'uploader': 'thestar.com',
198 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
199 },
200 'add_ie': ['Brightcove'],
201 },
0479c625
S
202 {
203 'url': 'http://www.championat.com/video/football/v/87/87499.html',
204 'md5': 'fb973ecf6e4a78a67453647444222983',
205 'info_dict': {
206 'id': '3414141473001',
207 'ext': 'mp4',
208 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
209 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
210 'uploader': 'Championat',
211 },
212 },
bdf97017 213 {
37aab278 214 # https://github.com/rg3/youtube-dl/issues/3541
bdf97017
NJ
215 'add_ie': ['Brightcove'],
216 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
217 'info_dict': {
218 'id': '3866516442001',
37aab278 219 'ext': 'mp4',
bdf97017
NJ
220 'title': 'Leer mij vrouwen kennen: Aflevering 1',
221 'description': 'Leer mij vrouwen kennen: Aflevering 1',
222 'uploader': 'SBS Broadcasting',
223 },
37aab278 224 'skip': 'Restricted to Netherlands',
bdf97017 225 'params': {
37aab278 226 'skip_download': True, # m3u8 download
bdf97017
NJ
227 },
228 },
c0d0b01f
JMF
229 # ooyala video
230 {
79649588 231 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
87830900 232 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
79649588
PH
233 'info_dict': {
234 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
235 'ext': 'mp4',
3486df38 236 'title': '2cc213299525360.mov', # that's what we get
c0d0b01f 237 },
87830900 238 'add_ie': ['Ooyala'],
c0d0b01f 239 },
f076b638 240 # multiple ooyala embeds on SBN network websites
241 {
242 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
243 'info_dict': {
244 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
245 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
246 },
247 'playlist_mincount': 3,
248 'params': {
249 'skip_download': True,
250 },
251 'add_ie': ['Ooyala'],
252 },
1b86cc41 253 # embed.ly video
254 {
255 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
256 'info_dict': {
257 'id': '9ODmcdjQcHQ',
258 'ext': 'mp4',
0a5bce56
PH
259 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
260 'upload_date': '20140225',
261 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
262 'uploader': 'Tested',
263 'uploader_id': 'testedcom',
1b86cc41 264 },
265 # No need to test YoutubeIE here
266 'params': {
267 'skip_download': True,
268 },
269 },
60cc4dc4
PH
270 # funnyordie embed
271 {
272 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
60cc4dc4
PH
273 'info_dict': {
274 'id': '18e820ec3f',
275 'ext': 'mp4',
276 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
277 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
93d020dd 278 },
60cc4dc4 279 },
faa4ea68
S
280 # BBC iPlayer embeds
281 {
282 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
283 'info_dict': {
284 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
285 },
286 'playlist_mincount': 18,
287 },
93d020dd
S
288 # RUTV embed
289 {
290 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
291 'info_dict': {
292 'id': '776940',
293 'ext': 'mp4',
294 'title': 'Охотское море стало целиком российским',
295 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
296 },
297 'params': {
298 # m3u8 download
299 'skip_download': True,
300 },
aab74fa1 301 },
f37bdbe5
S
302 # TVC embed
303 {
304 'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/',
305 'info_dict': {
306 'id': '55304',
307 'ext': 'mp4',
308 'title': 'Дошкольное воспитание',
309 },
310 },
b827a601
S
311 # SportBox embed
312 {
313 'url': 'http://www.vestifinance.ru/articles/25753',
314 'info_dict': {
315 'id': '25753',
316 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"',
317 },
318 'playlist': [{
319 'info_dict': {
320 'id': '370908',
321 'title': 'Госзаказ. День 3',
322 'ext': 'mp4',
323 }
324 }, {
325 'info_dict': {
326 'id': '370905',
327 'title': 'Госзаказ. День 2',
328 'ext': 'mp4',
329 }
330 }, {
331 'info_dict': {
332 'id': '370902',
333 'title': 'Госзаказ. День 1',
334 'ext': 'mp4',
335 }
336 }],
337 'params': {
338 # m3u8 download
339 'skip_download': True,
340 },
341 },
c76799c5
S
342 # XHamster embed
343 {
344 'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8',
345 'info_dict': {
346 'id': 'showthread',
347 'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )',
348 },
349 'playlist_mincount': 7,
350 },
aab74fa1
PH
351 # Embedded TED video
352 {
353 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
a8eb5a8e 354 'md5': '65fdff94098e4a607385a60c5177c638',
aab74fa1 355 'info_dict': {
a8eb5a8e 356 'id': '1969',
aab74fa1 357 'ext': 'mp4',
a8eb5a8e
PH
358 'title': 'Hidden miracles of the natural world',
359 'uploader': 'Louie Schwartzberg',
360 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
aab74fa1 361 }
60cc4dc4 362 },
5c386252 363 # Embedded Ustream video
364 {
365 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
366 'md5': '27b99cdb639c9b12a79bca876a073417',
367 'info_dict': {
ca6aada4 368 'id': '45734260',
369 'ext': 'flv',
370 'uploader': 'AU SPA: The NSA and Privacy',
5c386252 371 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
372 }
373 },
d95e35d6
S
374 # nowvideo embed hidden behind percent encoding
375 {
376 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
377 'md5': '2baf4ddd70f697d94b1c18cf796d5107',
378 'info_dict': {
379 'id': '06e53103ca9aa',
380 'ext': 'flv',
381 'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
382 'description': 'No description',
383 },
0f2a2ba1 384 },
893f8832
PH
385 # arte embed
386 {
387 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
388 'md5': '7653032cbb25bf6c80d80f217055fa43',
389 'info_dict': {
390 'id': '048195-004_PLUS7-F',
391 'ext': 'flv',
392 'title': 'X:enius',
393 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
394 'upload_date': '20140320',
395 },
396 'params': {
397 'skip_download': 'Requires rtmpdump'
398 }
399 },
fa35cdad
PH
400 # Condé Nast embed
401 {
402 'url': 'http://www.wired.com/2014/04/honda-asimo/',
403 'md5': 'ba0dfe966fa007657bd1443ee672db0f',
404 'info_dict': {
405 'id': '53501be369702d3275860000',
406 'ext': 'mp4',
407 'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
408 }
ebd3c7b3
PH
409 },
410 # Dailymotion embed
411 {
412 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
413 'md5': '441aeeb82eb72c422c7f14ec533999cd',
414 'info_dict': {
415 'id': 'k2mm4bCdJ6CQ2i7c8o2',
416 'ext': 'mp4',
417 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
418 'uploader': 'Spi0n',
419 },
420 'add_ie': ['Dailymotion'],
2b88feed
PH
421 },
422 # YouTube embed
423 {
424 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
425 'info_dict': {
426 'id': 'FXRb4ykk4S0',
427 'ext': 'mp4',
428 'title': 'The NBL Auction 2014',
429 'uploader': 'BADMINTON England',
430 'uploader_id': 'BADMINTONEvents',
431 'upload_date': '20140603',
432 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
433 },
434 'add_ie': ['Youtube'],
435 'params': {
436 'skip_download': True,
437 }
438 },
c5cd249e
JMF
439 # MTVServices embed
440 {
441 'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
442 'md5': '35727f82f58c76d996fc188f9755b0d5',
443 'info_dict': {
444 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
445 'ext': 'mp4',
446 'title': 'Review',
447 'description': 'Mario\'s life in the fast lane has never looked so good.',
448 },
449 },
61013473 450 # YouTube embed via <data-embed-url="">
451 {
452 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
61013473 453 'info_dict': {
a8eb5a8e 454 'id': '4vAffPZIT44',
61013473 455 'ext': 'mp4',
a8eb5a8e 456 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
ed2d6a19
PH
457 'uploader': 'Gameloft',
458 'uploader_id': 'gameloft',
a8eb5a8e
PH
459 'upload_date': '20140828',
460 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
ed2d6a19
PH
461 },
462 'params': {
463 'skip_download': True,
61013473 464 }
c8e9a235
PH
465 },
466 # Camtasia studio
467 {
468 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
469 'playlist': [{
470 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
471 'info_dict': {
472 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
473 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
474 'ext': 'flv',
475 'duration': 2235.90,
476 }
477 }, {
478 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
479 'info_dict': {
480 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
481 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
482 'ext': 'flv',
483 'duration': 2235.93,
484 }
485 }],
486 'info_dict': {
487 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
488 }
4d805e06
PH
489 },
490 # Flowplayer
491 {
492 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
493 'md5': '9d65602bf31c6e20014319c7d07fba27',
494 'info_dict': {
495 'id': '5123ea6d5e5a7',
496 'ext': 'mp4',
497 'age_limit': 18,
498 'uploader': 'www.handjobhub.com',
d6d9186f 499 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
4d805e06 500 }
0990305d 501 },
22a6f150
PH
502 # Multiple brightcove videos
503 # https://github.com/rg3/youtube-dl/issues/2283
504 {
505 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
506 'info_dict': {
507 'id': 'always-never',
508 'title': 'Always / Never - The New Yorker',
509 },
510 'playlist_count': 3,
511 'params': {
512 'extract_flat': False,
513 'skip_download': True,
514 }
1a94ff68
S
515 },
516 # MLB embed
517 {
518 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
519 'md5': '96f09a37e44da40dd083e12d9a683327',
520 'info_dict': {
521 'id': '33322633',
522 'ext': 'mp4',
523 'title': 'Ump changes call to ball',
524 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
525 'duration': 48,
526 'timestamp': 1401537900,
527 'upload_date': '20140531',
528 'thumbnail': 're:^https?://.*\.jpg$',
529 },
530 },
746c67d7
NJ
531 # Wistia embed
532 {
533 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
534 'md5': '8788b683c777a5cf25621eaf286d0c23',
535 'info_dict': {
536 'id': '1cfaf6b7ea',
537 'ext': 'mov',
538 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
539 'duration': 643.0,
540 'filesize': 182808282,
541 'uploader': 'education-portal.com',
542 },
543 },
52cffcb1 544 {
545 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
546 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
547 'info_dict': {
548 'id': 'uxjb0lwrcz',
549 'ext': 'mp4',
85d7b765 550 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
52cffcb1 551 'duration': 1715.0,
85d7b765 552 'uploader': 'thoughtworks.wistia.com',
70b7e3fb 553 },
52cffcb1 554 },
ac645ac7
PH
555 # Soundcloud embed
556 {
557 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
558 'info_dict': {
559 'id': '174391317',
560 'ext': 'mp3',
561 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
562 'uploader': 'Sophos Security',
563 'title': 'Chet Chat 171 - Oct 29, 2014',
564 'upload_date': '20141029',
565 }
af63fed7
PH
566 },
567 # Livestream embed
568 {
569 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
570 'info_dict': {
571 'id': '67864563',
572 'ext': 'flv',
573 'upload_date': '20141112',
574 'title': 'Rosetta #CometLanding webcast HL 10',
575 }
576 },
65f3a228
PH
577 # LazyYT
578 {
579 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
580 'info_dict': {
11e611a7 581 'id': '1986',
65f3a228
PH
582 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
583 },
584 'playlist_mincount': 2,
4e262a88 585 },
42bdd9d0
PH
586 # Cinchcast embed
587 {
588 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
589 'info_dict': {
590 'id': '7141703',
591 'ext': 'mp3',
592 'upload_date': '20141126',
593 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
594 }
595 },
501f13fb
PH
596 # Cinerama player
597 {
598 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
599 'info_dict': {
600 'id': '730m_DandD_1901_512k',
601 'ext': 'mp4',
602 'uploader': 'www.abc.net.au',
603 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
604 }
796df3c6
S
605 },
606 # embedded viddler video
607 {
608 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
609 'info_dict': {
610 'id': '4d03aad9',
611 'ext': 'mp4',
612 'uploader': 'deadspin',
613 'title': 'WALL-TO-GORTAT',
614 'timestamp': 1422285291,
615 'upload_date': '20150126',
616 },
617 'add_ie': ['Viddler'],
a0f71985 618 },
2051acde
S
619 # Libsyn embed
620 {
621 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
622 'info_dict': {
623 'id': '3377616',
624 'ext': 'mp3',
625 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
626 'description': 'md5:601cb790edd05908957dae8aaa866465',
627 'upload_date': '20150220',
628 },
629 },
a0f71985
PH
630 # jwplayer YouTube
631 {
632 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
633 'info_dict': {
634 'id': 'Mrj4DVp2zeA',
635 'ext': 'mp4',
f37e3f99 636 'upload_date': '20150212',
a0f71985
PH
637 'uploader': 'The National Archives UK',
638 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
639 'uploader_id': 'NationalArchives08',
640 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
641 },
59b8ab58
PH
642 },
643 # rtl.nl embed
644 {
645 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
646 'playlist_mincount': 5,
647 'info_dict': {
648 'id': 'aanslagen-kopenhagen',
649 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
650 }
255fca5e
S
651 },
652 # Zapiks embed
653 {
654 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
655 'info_dict': {
656 'id': '118046',
657 'ext': 'mp4',
658 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
659 }
660 },
e3216b82
NJ
661 # Kaltura embed
662 {
663 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
664 'info_dict': {
665 'id': '1_eergr3h1',
666 'ext': 'mp4',
667 'upload_date': '20150226',
668 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
669 'timestamp': int,
670 'title': 'John Carlson Postgame 2/25/15',
671 },
672 },
66e568de
S
673 # Kaltura embed (different embed code)
674 {
675 'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014',
676 'info_dict': {
677 'id': '1_a52wc67y',
678 'ext': 'flv',
679 'upload_date': '20150127',
680 'uploader_id': 'PremierMedia',
681 'timestamp': int,
682 'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014',
683 },
684 },
135c9c42
S
685 # Eagle.Platform embed (generic URL)
686 {
687 'url': 'http://lenta.ru/news/2015/03/06/navalny/',
688 'info_dict': {
689 'id': '227304',
690 'ext': 'mp4',
691 'title': 'Навальный вышел на свободу',
692 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
693 'thumbnail': 're:^https?://.*\.jpg$',
694 'duration': 87,
695 'view_count': int,
696 'age_limit': 0,
697 },
698 },
d47ae7f6
S
699 # ClipYou (Eagle.Platform) embed (custom URL)
700 {
701 'url': 'http://muz-tv.ru/play/7129/',
702 'info_dict': {
703 'id': '12820',
704 'ext': 'mp4',
705 'title': "'O Sole Mio",
706 'thumbnail': 're:^https?://.*\.jpg$',
707 'duration': 216,
708 'view_count': int,
709 },
710 },
f8388757
S
711 # Pladform embed
712 {
713 'url': 'http://muz-tv.ru/kinozal/view/7400/',
714 'info_dict': {
715 'id': '100183293',
716 'ext': 'mp4',
62259846 717 'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
f8388757
S
718 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
719 'thumbnail': 're:^https?://.*\.jpg$',
720 'duration': 694,
721 'age_limit': 0,
722 },
723 },
c798f15b
S
724 # Playwire embed
725 {
726 'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
727 'info_dict': {
728 'id': '3519514',
729 'ext': 'mp4',
730 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
731 'thumbnail': 're:^https?://.*\.png$',
732 'duration': 45.115,
733 },
734 },
ad320e9b
NJ
735 # 5min embed
736 {
737 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
738 'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
739 'info_dict': {
740 'id': '518726732',
741 'ext': 'mp4',
742 'title': 'Facebook Creates "On This Day" | Crunch Report',
743 },
744 },
dc455a5f
S
745 # SVT embed
746 {
747 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
748 'info_dict': {
749 'id': '2900353',
750 'ext': 'flv',
751 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
752 'duration': 27,
753 'age_limit': 0,
754 },
755 },
a4257017
S
756 # Crooks and Liars embed
757 {
758 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
759 'info_dict': {
760 'id': '8RUoRhRi',
761 'ext': 'mp4',
762 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
763 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
764 'timestamp': 1428207000,
765 'upload_date': '20150405',
766 'uploader': 'Heather',
767 },
768 },
769 # Crooks and Liars external embed
770 {
771 'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
772 'info_dict': {
773 'id': 'MTE3MjUtMzQ2MzA',
774 'ext': 'mp4',
775 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
776 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
777 'timestamp': 1265032391,
778 'upload_date': '20100201',
779 'uploader': 'Heather',
780 },
781 },
facecb84 782 # NBC Sports vplayer embed
a2edf2e7 783 {
facecb84 784 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
a2edf2e7 785 'info_dict': {
facecb84
S
786 'id': 'ln7x1qSThw4k',
787 'ext': 'flv',
788 'title': "PFT Live: New leader in the 'new-look' defense",
789 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
a2edf2e7 790 },
418c5cc3
YCH
791 },
792 # UDN embed
793 {
794 'url': 'http://www.udn.com/news/story/7314/822787',
01c58f84 795 'md5': 'fd2060e988c326991037b9aff9df21a6',
418c5cc3 796 'info_dict': {
01c58f84 797 'id': '300346',
418c5cc3 798 'ext': 'mp4',
01c58f84 799 'title': '中一中男師變性 全校師生力挺',
418c5cc3
YCH
800 'thumbnail': 're:^https?://.*\.jpg$',
801 }
edfcf7ab
YCH
802 },
803 # Ooyala embed
804 {
805 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
806 'info_dict': {
807 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
808 'ext': 'mp4',
809 'description': 'VIDEO: Index/Match versus VLOOKUP.',
810 'title': 'This is what separates the Excel masters from the wannabes',
811 },
812 'params': {
813 # m3u8 downloads
814 'skip_download': True,
815 }
d6fd958c
YCH
816 },
817 # Contains a SMIL manifest
818 {
819 'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
820 'info_dict': {
821 'id': 'file',
822 'ext': 'flv',
823 'title': '+ Football: Lottery Champions League Europe',
824 'uploader': 'www.telewebion.com',
825 },
826 'params': {
827 # rtmpe downloads
828 'skip_download': True,
829 }
b26733ba
YCH
830 },
831 # Brightcove URL in single quotes
832 {
833 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/',
834 'md5': '4ae374f1f8b91c889c4b9203c8c752af',
835 'info_dict': {
836 'id': '4255764656001',
837 'ext': 'mp4',
838 'title': 'SN Presents: Russell Martin, World Citizen',
839 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
840 'uploader': 'Rogers Sportsnet',
841 },
756f574e
YCH
842 },
843 # Dailymotion Cloud video
844 {
845 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
846 'md5': '49444254273501a64675a7e68c502681',
847 'info_dict': {
848 'id': '5585de919473990de4bee11b',
849 'ext': 'mp4',
850 'title': 'Le débat',
851 'thumbnail': 're:^https?://.*\.jpe?g$',
852 }
a5158f38 853 },
8084be78
S
854 # OnionStudios embed
855 {
856 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537',
857 'info_dict': {
858 'id': '2855',
859 'ext': 'mp4',
860 'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You',
861 'thumbnail': 're:^https?://.*\.jpe?g$',
862 'uploader': 'ClickHole',
863 'uploader_id': 'clickhole',
864 }
865 },
b8c1cc1a
S
866 # SnagFilms embed
867 {
868 'url': 'http://whilewewatch.blogspot.ru/2012/06/whilewewatch-whilewewatch-gripping.html',
869 'info_dict': {
870 'id': '74849a00-85a9-11e1-9660-123139220831',
871 'ext': 'mp4',
872 'title': '#whilewewatch',
873 }
874 },
a5158f38
YCH
875 # AdobeTVVideo embed
876 {
877 'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners',
878 'md5': '43662b577c018ad707a63766462b1e87',
879 'info_dict': {
880 'id': '2456',
881 'ext': 'mp4',
882 'title': 'New experience with Acrobat DC',
883 'description': 'New experience with Acrobat DC',
884 'duration': 248.667,
885 },
76c73715 886 }
cfe50f04 887 ]
9b122384 888
9b122384
PH
def report_following_redirect(self, new_url):
    """Tell the user that a redirect to new_url is being followed."""
    # The notice goes through the downloader's to_screen().
    message = '[redirect] Following redirect to %s' % new_url
    self._downloader.to_screen(message)
9b122384 892
4fc946b5
PH
893 def _extract_rss(self, url, video_id, doc):
894 playlist_title = doc.find('./channel/title').text
895 playlist_desc_el = doc.find('./channel/description')
896 playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
897
76c73715
PH
898 entries = []
899 for it in doc.findall('./channel/item'):
900 next_url = xpath_text(it, 'link', fatal=False)
901 if not next_url:
902 enclosure_nodes = it.findall('./enclosure')
903 for e in enclosure_nodes:
904 next_url = e.attrib.get('url')
905 if next_url:
906 break
907
908 if not next_url:
909 continue
910
911 entries.append({
912 '_type': 'url',
913 'url': next_url,
914 'title': it.find('title').text,
915 })
4fc946b5
PH
916
917 return {
918 '_type': 'playlist',
919 'id': url,
920 'title': playlist_title,
921 'description': playlist_desc,
922 'entries': entries,
923 }
924
c8e9a235
PH
def _extract_camtasia(self, url, video_id, webpage):
    """Extract a Camtasia Studio playlist from webpage.

    Looks for the 'csConfigFile' variable in the page, downloads the XML
    configuration it points to (resolved relative to url) and turns each
    child of ./playlist/array/fileset that has a non-empty <uri> into a
    playlist entry.

    Returns None if no camtasia video can be found, i.e. when the page
    does not reference a configuration file or the configuration has no
    fileset node. The 'DC.title' meta lookup is fatal, so a page that
    references a configuration but lacks that meta tag raises instead.
    """
    camtasia_cfg = self._search_regex(
        r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
        webpage, 'camtasia configuration file', default=None)
    if camtasia_cfg is None:
        return None

    title = self._html_search_meta('DC.title', webpage, fatal=True)

    camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
    camtasia_cfg = self._download_xml(
        camtasia_url, video_id,
        note='Downloading camtasia configuration',
        errnote='Failed to download camtasia configuration')
    fileset_node = camtasia_cfg.find('./playlist/array/fileset')
    if fileset_node is None:
        return None

    entries = []
    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9.
    for n in fileset_node:
        url_n = n.find('./uri')
        if url_n is None or not url_n.text:
            continue

        duration_n = n.find('./duration')
        entries.append({
            # Media file name without directory and extension.
            'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
            'title': '%s - %s' % (title, n.tag),
            'url': compat_urlparse.urljoin(url, url_n.text),
            'duration': (
                None if duration_n is None
                else float_or_none(duration_n.text)),
        })

    return {
        '_type': 'playlist',
        'entries': entries,
        'title': title,
    }
961
9b122384 962 def _real_extract(self, url):
ebd3c7b3
PH
963 if url.startswith('//'):
964 return {
965 '_type': 'url',
20991253 966 'url': self.http_scheme() + url,
ebd3c7b3
PH
967 }
968
a7130543
JMF
969 parsed_url = compat_urlparse.urlparse(url)
970 if not parsed_url.scheme:
04b4d394
PH
971 default_search = self._downloader.params.get('default_search')
972 if default_search is None:
1f7ccb90 973 default_search = 'fixup_error'
04b4d394 974
1f7ccb90 975 if default_search in ('auto', 'auto_warning', 'fixup_error'):
04b4d394
PH
976 if '/' in url:
977 self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
978 return self.url_result('http://' + url)
1f7ccb90 979 elif default_search != 'fixup_error':
9c1fc022 980 if default_search == 'auto_warning':
0e67ab0d
PH
981 if re.match(r'^(?:url|URL)$', url):
982 raise ExtractorError(
983 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
984 expected=True)
985 else:
986 self._downloader.report_warning(
7571c02c 987 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
04b4d394 988 return self.url_result('ytsearch:' + url)
1f7ccb90
PH
989
990 if default_search in ('error', 'fixup_error'):
7571c02c 991 raise ExtractorError(
b74e86f4
PH
992 '%r is not a valid URL. '
993 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
994 % (url, url), expected=True)
04b4d394 995 else:
f2f2c0c2
PH
996 if ':' not in default_search:
997 default_search += ':'
04b4d394 998 return self.url_result(default_search + url)
4d54ef20
PH
999
1000 url, smuggled_data = unsmuggle_url(url)
1001 force_videoid = None
d6e6a422 1002 is_intentional = smuggled_data and smuggled_data.get('to_generic')
4d54ef20
PH
1003 if smuggled_data and 'force_videoid' in smuggled_data:
1004 force_videoid = smuggled_data['force_videoid']
1005 video_id = force_videoid
1006 else:
1ddb9456 1007 video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
a7130543 1008
79649588 1009 self.to_screen('%s: Requesting header' % video_id)
c1d1facd 1010
ebab4520 1011 head_req = HEADRequest(url)
23be51d8 1012 head_response = self._request_webpage(
ebab4520
PH
1013 head_req, video_id,
1014 note=False, errnote='Could not send HEAD request to %s' % url,
1015 fatal=False)
42393ce2 1016
23be51d8 1017 if head_response is not False:
42393ce2 1018 # Check for redirect
23be51d8 1019 new_url = head_response.geturl()
42393ce2
PH
1020 if url != new_url:
1021 self.report_following_redirect(new_url)
4d54ef20
PH
1022 if force_videoid:
1023 new_url = smuggle_url(
1024 new_url, {'force_videoid': force_videoid})
cecaaf3f 1025 return self.url_result(new_url)
42393ce2 1026
23be51d8
PH
1027 full_response = None
1028 if head_response is False:
58bde34a
S
1029 request = compat_urllib_request.Request(url)
1030 request.add_header('Accept-Encoding', '*')
1031 full_response = self._request_webpage(request, video_id)
23be51d8
PH
1032 head_response = full_response
1033
1034 # Check for direct link to a video
1035 content_type = head_response.headers.get('Content-Type', '')
1036 m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
1037 if m:
1038 upload_date = unified_strdate(
1039 head_response.headers.get('Last-Modified'))
1040 return {
1041 'id': video_id,
1ddb9456 1042 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
ccdd0ffb 1043 'direct': True,
23be51d8
PH
1044 'formats': [{
1045 'format_id': m.group('format_id'),
1046 'url': url,
1047 'vcodec': 'none' if m.group('type') == 'audio' else None
1048 }],
1049 'upload_date': upload_date,
1050 }
42393ce2 1051
d6e6a422 1052 if not self._downloader.params.get('test', False) and not is_intentional:
2fece970
S
1053 force = self._downloader.params.get('force_generic_extractor', False)
1054 self._downloader.report_warning(
1055 '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
d6e6a422 1056
4e262a88 1057 if not full_response:
58bde34a
S
1058 request = compat_urllib_request.Request(url)
1059 # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
1060 # making it impossible to download only chunk of the file (yet we need only 512kB to
1061 # test whether it's HTML or not). According to youtube-dl default Accept-Encoding
1062 # that will always result in downloading the whole file that is not desirable.
1063 # Therefore for extraction pass we have to override Accept-Encoding to any in order
1064 # to accept raw bytes and being able to download only a chunk.
1065 # It may probably better to solve this by checking Content-Type for application/octet-stream
1066 # after HEAD request finishes, but not sure if we can rely on this.
1067 request.add_header('Accept-Encoding', '*')
1068 full_response = self._request_webpage(request, video_id)
4e262a88
PH
1069
1070 # Maybe it's a direct link to a video?
1071 # Be careful not to download the whole thing!
1072 first_bytes = full_response.read(512)
61ca9a80 1073 if not is_html(first_bytes):
4e262a88
PH
1074 self._downloader.report_warning(
1075 'URL could be a direct video link, returning it as such.')
1076 upload_date = unified_strdate(
1077 head_response.headers.get('Last-Modified'))
1078 return {
1079 'id': video_id,
1ddb9456 1080 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
4e262a88
PH
1081 'direct': True,
1082 'url': url,
1083 'upload_date': upload_date,
1084 }
1085
1086 webpage = self._webpage_read_content(
1087 full_response, url, video_id, prefix=first_bytes)
1088
9b122384 1089 self.report_extraction(video_id)
887c6acd 1090
4fc946b5
PH
1091 # Is it an RSS feed?
1092 try:
bcf89ce6 1093 doc = parse_xml(webpage)
4fc946b5
PH
1094 if doc.tag == 'rss':
1095 return self._extract_rss(url, video_id, doc)
f7300c5c 1096 except compat_xml_parse_error:
4fc946b5
PH
1097 pass
1098
c8e9a235
PH
1099 # Is it a Camtasia project?
1100 camtasia_res = self._extract_camtasia(url, video_id, webpage)
1101 if camtasia_res is not None:
1102 return camtasia_res
1103
14390730
S
1104 # Sometimes embedded video player is hidden behind percent encoding
1105 # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
1106 # Unescaping the whole page allows to handle those cases in a generic way
1f7659db
S
1107 webpage = compat_urllib_parse.unquote(webpage)
1108
887c6acd
PH
1109 # it's tempting to parse this further, but you would
1110 # have to take into account all the variations like
1111 # Video Title - Site Name
1112 # Site Name | Video Title
1113 # Video Title - Tagline | Site Name
1114 # and so on and so forth; it's just not practical
ef4fd848 1115 video_title = self._html_search_regex(
79649588
PH
1116 r'(?s)<title>(.*?)</title>', webpage, 'video title',
1117 default='video')
ef4fd848 1118
4d805e06
PH
1119 # Try to detect age limit automatically
1120 age_limit = self._rta_search(webpage)
1121 # And then there are the jokers who advertise that they use RTA,
1122 # but actually don't.
1123 AGE_LIMIT_MARKERS = [
1124 r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
1125 ]
1126 if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
1127 age_limit = 18
1128
ef4fd848
PH
1129 # video uploader is domain name
1130 video_uploader = self._search_regex(
79649588 1131 r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
887c6acd 1132
ed2d6a19 1133 # Helper method
83992676 1134 def _playlist_from_matches(matches, getter=None, ie=None):
3b2f933b 1135 urlrs = orderedSet(
83992676 1136 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
3b2f933b 1137 for m in matches)
ed2d6a19
PH
1138 return self.playlist_result(
1139 urlrs, playlist_id=video_id, playlist_title=video_title)
1140
627a91a9 1141 # Look for BrightCove:
99877772
PH
1142 bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
1143 if bc_urls:
79649588 1144 self.to_screen('Brightcove video detected.')
99877772
PH
1145 entries = [{
1146 '_type': 'url',
1147 'url': smuggle_url(bc_url, {'Referer': url}),
1148 'ie_key': 'Brightcove'
1149 } for bc_url in bc_urls]
1150
1151 return {
1152 '_type': 'playlist',
1153 'title': video_title,
1154 'id': video_id,
1155 'entries': entries,
1156 }
cfe50f04 1157
59b8ab58
PH
1158 # Look for embedded rtl.nl player
1159 matches = re.findall(
97b570a9 1160 r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
59b8ab58
PH
1161 webpage)
1162 if matches:
1163 return _playlist_from_matches(matches, ie='RtlNl')
1164
b407e173
YCH
1165 vimeo_url = VimeoIE._extract_vimeo_url(url, webpage)
1166 if vimeo_url is not None:
1167 return self.url_result(vimeo_url)
7115ca84 1168
53c1d3ef 1169 # Look for embedded YouTube player
1f9da904 1170 matches = re.findall(r'''(?x)
2b88feed
PH
1171 (?:
1172 <iframe[^>]+?src=|
c71dfccc 1173 data-video-url=|
2b88feed 1174 <embed[^>]+?src=|
a7e97f6d
PH
1175 embedSWF\(?:\s*|
1176 new\s+SWFObject\(
2b88feed
PH
1177 )
1178 (["\'])
1bf5423e 1179 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
6b08cdf6 1180 (?:embed|v|p)/.+?)
1f9da904 1181 \1''', webpage)
887c6acd 1182 if matches:
ed2d6a19 1183 return _playlist_from_matches(
3b2f933b 1184 matches, lambda m: unescapeHTML(m[1]))
53c1d3ef 1185
65f3a228
PH
1186 # Look for lazyYT YouTube embed
1187 matches = re.findall(
1188 r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
1189 if matches:
1190 return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
1191
355e4fd0
PH
1192 # Look for embedded Dailymotion player
1193 matches = re.findall(
ef4fd848 1194 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
355e4fd0 1195 if matches:
ed2d6a19
PH
1196 return _playlist_from_matches(
1197 matches, lambda m: unescapeHTML(m[1]))
355e4fd0 1198
8489578d
NJ
1199 # Look for embedded Dailymotion playlist player (#3822)
1200 m = re.search(
1201 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
1202 if m:
1203 playlists = re.findall(
1204 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
1205 if playlists:
1206 return _playlist_from_matches(
1207 playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
1208
ef4fd848
PH
1209 # Look for embedded Wistia player
1210 match = re.search(
281d3f1d 1211 r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
ef4fd848 1212 if match:
9471c444
NJ
1213 embed_url = self._proto_relative_url(
1214 unescapeHTML(match.group('url')))
ef4fd848
PH
1215 return {
1216 '_type': 'url_transparent',
9471c444 1217 'url': embed_url,
ef4fd848
PH
1218 'ie_key': 'Wistia',
1219 'uploader': video_uploader,
1220 'title': video_title,
1221 'id': video_id,
1222 }
5f6a1245 1223
9471c444 1224 match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
746c67d7
NJ
1225 if match:
1226 return {
1227 '_type': 'url_transparent',
1228 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
1229 'ie_key': 'Wistia',
1230 'uploader': video_uploader,
1231 'title': video_title,
1232 'id': match.group('id')
1233 }
ef4fd848 1234
ee3e63e4 1235 # Look for embedded blip.tv player
0954cd8a
YCH
1236 bliptv_url = BlipTVIE._extract_url(webpage)
1237 if bliptv_url:
1238 return self.url_result(bliptv_url, 'BlipTV')
ee3e63e4 1239
bab19a8e
S
1240 # Look for SVT player
1241 svt_url = SVTIE._extract_url(webpage)
1242 if svt_url:
1243 return self.url_result(svt_url, 'SVT')
1244
fa35cdad
PH
1245 # Look for embedded condenast player
1246 matches = re.findall(
1247 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1248 webpage)
1249 if matches:
1250 return {
1251 '_type': 'playlist',
1252 'entries': [{
1253 '_type': 'url',
1254 'ie_key': 'CondeNast',
1255 'url': ma,
1256 } for ma in matches],
1257 'title': video_title,
1258 'id': video_id,
1259 }
1260
c19f7764
JMF
1261 # Look for Bandcamp pages with custom domain
1262 mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1263 if mobj is not None:
1264 burl = unescapeHTML(mobj.group(1))
09804265
JMF
1265 # Don't set the extractor because it can be a track url or an album
1266 return self.url_result(burl)
c19f7764 1267
f25571ff
PH
1268 # Look for embedded Vevo player
1269 mobj = re.search(
1270 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1271 if mobj is not None:
1272 return self.url_result(mobj.group('url'))
796df3c6
S
1273
1274 # Look for embedded Viddler player
cb454b33
S
1275 mobj = re.search(
1276 r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1277 webpage)
796df3c6
S
1278 if mobj is not None:
1279 return self.url_result(mobj.group('url'))
f25571ff 1280
3378d67a
S
1281 # Look for NYTimes player
1282 mobj = re.search(
1283 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1284 webpage)
1285 if mobj is not None:
1286 return self.url_result(mobj.group('url'))
1287
cefdf970
S
1288 # Look for Libsyn player
1289 mobj = re.search(
1290 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1291 if mobj is not None:
1292 return self.url_result(mobj.group('url'))
1293
c0d0b01f 1294 # Look for Ooyala videos
cb454b33 1295 mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
f076b638 1296 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
edfcf7ab
YCH
1297 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
1298 re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
c0d0b01f 1299 if mobj is not None:
750f9020 1300 return OoyalaIE._build_url_result(mobj.group('ec'))
c0d0b01f 1301
f076b638 1302 # Look for multiple Ooyala embeds on SBN network websites
1303 mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1304 if mobj is not None:
1305 embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1306 if embeds:
1307 return _playlist_from_matches(
1308 embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1309
aa94a6d3 1310 # Look for Aparat videos
48099643 1311 mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
aa94a6d3
PH
1312 if mobj is not None:
1313 return self.url_result(mobj.group(1), 'Aparat')
1314
c93c2ab1 1315 # Look for MPORA videos
c3f51436 1316 mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
c93c2ab1
PH
1317 if mobj is not None:
1318 return self.url_result(mobj.group(1), 'Mpora')
5f59ee79 1319
15c0e8e7 1320 # Look for embedded NovaMov-based player
8f89e687 1321 mobj = re.search(
8dfa187b 1322 r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
15c0e8e7
S
1323 (?P<url>http://(?:(?:embed|www)\.)?
1324 (?:novamov\.com|
1325 nowvideo\.(?:ch|sx|eu|at|ag|co)|
1326 videoweed\.(?:es|com)|
1327 movshare\.(?:net|sx|ag)|
1328 divxstage\.(?:eu|net|ch|co|at|ag))
1329 /embed\.php.+?)\1''', webpage)
8f89e687 1330 if mobj is not None:
15c0e8e7 1331 return self.url_result(mobj.group('url'))
50f56607 1332
9834872b
PH
1333 # Look for embedded Facebook player
1334 mobj = re.search(
db1f3888 1335 r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
9834872b
PH
1336 if mobj is not None:
1337 return self.url_result(mobj.group('url'), 'Facebook')
1338
ca97a56e
S
1339 # Look for embedded VK player
1340 mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1341 if mobj is not None:
1342 return self.url_result(mobj.group('url'), 'VK')
1343
0364fa8b
S
1344 # Look for embedded ivi player
1345 mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1346 if mobj is not None:
1347 return self.url_result(mobj.group('url'), 'Ivi')
1348
db1f3888
PH
1349 # Look for embedded Huffington Post player
1350 mobj = re.search(
c3f51436 1351 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
db1f3888
PH
1352 if mobj is not None:
1353 return self.url_result(mobj.group('url'), 'HuffPost')
1354
1b86cc41 1355 # Look for embed.ly
1356 mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1357 if mobj is not None:
1358 return self.url_result(mobj.group('url'))
1359 mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1360 if mobj is not None:
1361 return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1362
60cc4dc4
PH
1363 # Look for funnyordie embed
1364 matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1365 if matches:
ed2d6a19
PH
1366 return _playlist_from_matches(
1367 matches, getter=unescapeHTML, ie='FunnyOrDie')
60cc4dc4 1368
db546cf8
S
1369 # Look for BBC iPlayer embed
1370 matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1371 if matches:
476eae0c 1372 return _playlist_from_matches(matches, ie='BBCCoUk')
db546cf8 1373
93d020dd
S
1374 # Look for embedded RUTV player
1375 rutv_url = RUTVIE._extract_url(webpage)
1376 if rutv_url:
1377 return self.url_result(rutv_url, 'RUTV')
1378
494f20cb 1379 # Look for embedded TVC player
b8599718
S
1380 tvc_url = TVCIE._extract_url(webpage)
1381 if tvc_url:
1382 return self.url_result(tvc_url, 'TVC')
494f20cb 1383
d40a3b5b
S
1384 # Look for embedded SportBox player
1385 sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
1386 if sportbox_urls:
1387 return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
1388
78e2b74b 1389 # Look for embedded PornHub player
65d161c4
S
1390 pornhub_url = PornHubIE._extract_url(webpage)
1391 if pornhub_url:
1392 return self.url_result(pornhub_url, 'PornHub')
1393
2bb5b6d0
S
1394 # Look for embedded XHamster player
1395 xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
1396 if xhamster_urls:
1397 return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
1398
9872d311
S
1399 # Look for embedded Tvigle player
1400 mobj = re.search(
1401 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
1402 if mobj is not None:
1403 return self.url_result(mobj.group('url'), 'Tvigle')
1404
7e2ede98
JMF
1405 # Look for embedded TED player
1406 mobj = re.search(
d7cc31b6 1407 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
7e2ede98
JMF
1408 if mobj is not None:
1409 return self.url_result(mobj.group('url'), 'TED')
1410
5c386252 1411 # Look for embedded Ustream videos
1412 mobj = re.search(
1413 r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1414 if mobj is not None:
1415 return self.url_result(mobj.group('url'), 'Ustream')
1416
893f8832
PH
1417 # Look for embedded arte.tv player
1418 mobj = re.search(
1419 r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1420 webpage)
1421 if mobj is not None:
1422 return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1423
cb3ac1c6
S
1424 # Look for embedded smotri.com player
1425 smotri_url = SmotriIE._extract_url(webpage)
1426 if smotri_url:
1427 return self.url_result(smotri_url, 'Smotri')
1428
e6c2d9ad 1429 # Look for embedded Myvi.ru player
6dd94d3a 1430 myvi_url = MyviIE._extract_url(webpage)
e6c2d9ad
S
1431 if myvi_url:
1432 return self.url_result(myvi_url)
1433
20991253
PH
1434 # Look for embeded soundcloud player
1435 mobj = re.search(
ac645ac7 1436 r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
20991253
PH
1437 webpage)
1438 if mobj is not None:
1439 url = unescapeHTML(mobj.group('url'))
1440 return self.url_result(url)
1441
826ec77f
PH
1442 # Look for embedded vulture.com player
1443 mobj = re.search(
1444 r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1445 webpage)
1446 if mobj is not None:
1447 url = unescapeHTML(mobj.group('url'))
1448 return self.url_result(url, ie='Vulture')
1449
c5cd249e
JMF
1450 # Look for embedded mtvservices player
1451 mobj = re.search(
1452 r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1453 webpage)
1454 if mobj is not None:
1455 url = unescapeHTML(mobj.group('url'))
1456 return self.url_result(url, ie='MTVServicesEmbedded')
1457
49807b4a
S
1458 # Look for embedded yahoo player
1459 mobj = re.search(
1460 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1461 webpage)
1462 if mobj is not None:
1463 return self.url_result(mobj.group('url'), 'Yahoo')
1464
2ef6fcb5
PH
1465 # Look for embedded sbs.com.au player
1466 mobj = re.search(
e98b8e79
PH
1467 r'''(?x)
1468 (?:
1469 <meta\s+property="og:video"\s+content=|
1470 <iframe[^>]+?src=
1471 )
1472 (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
2ef6fcb5
PH
1473 webpage)
1474 if mobj is not None:
1475 return self.url_result(mobj.group('url'), 'SBS')
1476
42bdd9d0
PH
1477 # Look for embedded Cinchcast player
1478 mobj = re.search(
1479 r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1480 webpage)
1481 if mobj is not None:
1482 return self.url_result(mobj.group('url'), 'Cinchcast')
1483
1a94ff68 1484 mobj = re.search(
5263cdfc 1485 r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1a94ff68 1486 webpage)
8001607e
YCH
1487 if not mobj:
1488 mobj = re.search(
1489 r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
1490 webpage)
1a94ff68
S
1491 if mobj is not None:
1492 return self.url_result(mobj.group('url'), 'MLB')
1493
1419fafd
S
1494 mobj = re.search(
1495 r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1496 webpage)
1497 if mobj is not None:
1498 return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1499
af63fed7
PH
1500 mobj = re.search(
1501 r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1502 webpage)
1503 if mobj is not None:
1504 return self.url_result(mobj.group('url'), 'Livestream')
1505
255fca5e
S
1506 # Look for Zapiks embed
1507 mobj = re.search(
1508 r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1509 if mobj is not None:
1510 return self.url_result(mobj.group('url'), 'Zapiks')
1511
e3216b82 1512 # Look for Kaltura embeds
66e568de
S
1513 mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage) or
1514 re.search(r'(?s)(["\'])(?:https?:)?//cdnapisec\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?\1.*?entry_id\s*:\s*(["\'])(?P<id>[^\2]+?)\2', webpage))
e3216b82
NJ
1515 if mobj is not None:
1516 return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1517
135c9c42
S
1518 # Look for Eagle.Platform embeds
1519 mobj = re.search(
1520 r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1521 if mobj is not None:
1522 return self.url_result(mobj.group('url'), 'EaglePlatform')
1523
d47ae7f6
S
1524 # Look for ClipYou (uses Eagle.Platform) embeds
1525 mobj = re.search(
1526 r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1527 if mobj is not None:
1528 return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1529
f8388757
S
1530 # Look for Pladform embeds
1531 mobj = re.search(
1532 r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1533 if mobj is not None:
1534 return self.url_result(mobj.group('url'), 'Pladform')
1535
2dcc114f
S
1536 # Look for Playwire embeds
1537 mobj = re.search(
1538 r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
1539 if mobj is not None:
1540 return self.url_result(mobj.group('url'))
1541
ad320e9b
NJ
1542 # Look for 5min embeds
1543 mobj = re.search(
1544 r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1545 if mobj is not None:
1546 return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1547
18153f1b
S
1548 # Look for Crooks and Liars embeds
1549 mobj = re.search(
1550 r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
1551 if mobj is not None:
1552 return self.url_result(mobj.group('url'))
1553
a2edf2e7
YCH
1554 # Look for NBC Sports VPlayer embeds
1555 nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1556 if nbc_sports_url:
1557 return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1558
418c5cc3
YCH
1559 # Look for UDN embeds
1560 mobj = re.search(
1561 r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
1562 if mobj is not None:
1563 return self.url_result(
0a160363 1564 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
418c5cc3 1565
2fe1b5bd
YCH
1566 # Look for Senate ISVP iframe
1567 senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
1568 if senate_isvp_url:
25c3a734 1569 return self.url_result(senate_isvp_url, 'SenateISVP')
2fe1b5bd 1570
756f574e
YCH
1571 # Look for Dailymotion Cloud videos
1572 dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
1573 if dmcloud_url:
1574 return self.url_result(dmcloud_url, 'DailymotionCloud')
1575
1ac1c4c2
S
1576 # Look for OnionStudios embeds
1577 onionstudios_url = OnionStudiosIE._extract_url(webpage)
1578 if onionstudios_url:
1579 return self.url_result(onionstudios_url)
1580
eedd20ef
S
1581 # Look for SnagFilms embeds
1582 snagfilms_url = SnagFilmsEmbedIE._extract_url(webpage)
1583 if snagfilms_url:
1584 return self.url_result(snagfilms_url)
1585
a5158f38
YCH
1586 # Look for AdobeTVVideo embeds
1587 mobj = re.search(
1588 r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
1589 webpage)
1590 if mobj is not None:
1591 return self.url_result(
1592 self._proto_relative_url(unescapeHTML(mobj.group(1))),
1593 'AdobeTVVideo')
1594
ced659bb 1595 def check_video(vurl):
a0f71985
PH
1596 if YoutubeIE.suitable(vurl):
1597 return True
ced659bb
S
1598 vpath = compat_urlparse.urlparse(vurl).path
1599 vext = determine_ext(vpath)
1600 return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
1601
1602 def filter_video(urls):
1603 return list(filter(check_video, urls))
1604
9b122384 1605 # Start with something easy: JW Player in SWFObject
ced659bb 1606 found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
b30b8698 1607 if not found:
d981cef6 1608 # Look for gorilla-vid style embedding
ced659bb 1609 found = filter_video(re.findall(r'''(?sx)
c0292e8a
PH
1610 (?:
1611 jw_plugins|
1612 JWPlayerOptions|
1613 jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1614 )
a0f71985
PH
1615 .*?
1616 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
b30b8698 1617 if not found:
9b122384 1618 # Broaden the search a little bit
ced659bb 1619 found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
b30b8698
PH
1620 if not found:
1621 # Broaden the findall a little bit: JWPlayer JS loader
ced659bb
S
1622 found = filter_video(re.findall(
1623 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
4d805e06
PH
1624 if not found:
1625 # Flow player
ced659bb 1626 found = filter_video(re.findall(r'''(?xs)
4d805e06
PH
1627 flowplayer\("[^"]+",\s*
1628 \{[^}]+?\}\s*,
52585fd6 1629 \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
4d805e06 1630 ["']?url["']?\s*:\s*["']([^"']+)["']
ced659bb 1631 ''', webpage))
501f13fb
PH
1632 if not found:
1633 # Cinerama player
1634 found = re.findall(
1635 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
b30b8698 1636 if not found:
9b122384 1637 # Try to find twitter cards info
ced659bb
S
1638 found = filter_video(re.findall(
1639 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
b30b8698 1640 if not found:
9b122384
PH
1641 # We look for Open Graph info:
1642 # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
b30b8698 1643 m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
9b122384
PH
1644 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1645 if m_video_type is not None:
ced659bb 1646 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
b30b8698 1647 if not found:
7fea7156 1648 # HTML5 video
9b32eca3 1649 found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
b30b8698 1650 if not found:
ed9a25dd 1651 REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
a5a45015 1652 found = re.search(
89ef304b 1653 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
ed9a25dd 1654 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
89ef304b 1655 webpage)
84f81016
S
1656 if not found:
1657 # Look also in Refresh HTTP header
1658 refresh_header = head_response.headers.get('Refresh')
1659 if refresh_header:
ed9a25dd 1660 found = re.search(REDIRECT_REGEX, refresh_header)
b30b8698 1661 if found:
406224be 1662 new_url = compat_urlparse.urljoin(url, found.group(1))
89ef304b
PH
1663 self.report_following_redirect(new_url)
1664 return {
1665 '_type': 'url',
1666 'url': new_url,
1667 }
b30b8698 1668 if not found:
416c7fcb 1669 raise UnsupportedError(url)
9b122384 1670
b30b8698
PH
1671 entries = []
1672 for video_url in found:
1673 video_url = compat_urlparse.urljoin(url, video_url)
1674 video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
9b122384 1675
b30b8698
PH
1676 # Sometimes, jwplayer extraction will result in a YouTube URL
1677 if YoutubeIE.suitable(video_url):
1678 entries.append(self.url_result(video_url, 'Youtube'))
1679 continue
9b122384 1680
b30b8698
PH
1681 # here's a fun little line of code for you:
1682 video_id = os.path.splitext(video_id)[0]
fc9713a1 1683
d6fd958c
YCH
1684 if determine_ext(video_url) == 'smil':
1685 entries.append({
1686 'id': video_id,
1687 'formats': self._extract_smil_formats(video_url, video_id),
1688 'uploader': video_uploader,
1689 'title': video_title,
1690 'age_limit': age_limit,
1691 })
1692 else:
1693 entries.append({
1694 'id': video_id,
1695 'url': video_url,
1696 'uploader': video_uploader,
1697 'title': video_title,
1698 'age_limit': age_limit,
1699 })
b30b8698
PH
1700
1701 if len(entries) == 1:
669f0e7c 1702 return entries[0]
b30b8698
PH
1703 else:
1704 for num, e in enumerate(entries, start=1):
13d8fbef
JMF
1705 # 'url' results don't have a title
1706 if e.get('title') is not None:
1707 e['title'] = '%s (%d)' % (e['title'], num)
b30b8698
PH
1708 return {
1709 '_type': 'playlist',
1710 'entries': entries,
1711 }