]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/generic.py
[bandcamp:album] Fix extractor results and associated test
[yt-dlp.git] / youtube_dl / extractor / generic.py
CommitLineData
cfe50f04
JMF
1# encoding: utf-8
2
79649588
PH
3from __future__ import unicode_literals
4
9b122384
PH
5import os
6import re
7
8from .common import InfoExtractor
fc9713a1 9from .youtube import YoutubeIE
8c25f81b 10from ..compat import (
9b122384 11 compat_urllib_parse,
a5caba1e 12 compat_urlparse,
f7300c5c 13 compat_xml_parse_error,
8c25f81b
PH
14)
15from ..utils import (
b759a0d4 16 determine_ext,
9b122384 17 ExtractorError,
c8e9a235 18 float_or_none,
aa94a6d3 19 HEADRequest,
61ca9a80 20 is_html,
ed2d6a19 21 orderedSet,
bcf89ce6 22 parse_xml,
9d4660ca
PH
23 smuggle_url,
24 unescapeHTML,
42393ce2 25 unified_strdate,
4d54ef20 26 unsmuggle_url,
416c7fcb 27 UnsupportedError,
42393ce2 28 url_basename,
9b122384 29)
cfe50f04 30from .brightcove import BrightcoveIE
c0d0b01f 31from .ooyala import OoyalaIE
93d020dd 32from .rutv import RUTVIE
cb3ac1c6 33from .smotri import SmotriIE
1419fafd 34from .condenast import CondeNastIE
9b122384 35
0838239e 36
9b122384 37class GenericIE(InfoExtractor):
79649588 38 IE_DESC = 'Generic downloader that works on some sites'
9b122384 39 _VALID_URL = r'.*'
79649588 40 IE_NAME = 'generic'
cfe50f04
JMF
41 _TESTS = [
42 {
79649588 43 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
d360a146 44 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
79649588 45 'info_dict': {
d360a146
S
46 'id': '13601338388002',
47 'ext': 'mp4',
79649588
PH
48 'uploader': 'www.hodiho.fr',
49 'title': 'R\u00e9gis plante sa Jeep',
cfe50f04
JMF
50 }
51 },
c19f7764
JMF
52 # bandcamp page with custom domain
53 {
79649588
PH
54 'add_ie': ['Bandcamp'],
55 'url': 'http://bronyrock.com/track/the-pony-mash',
79649588 56 'info_dict': {
fd50bf62
S
57 'id': '3235767654',
58 'ext': 'mp3',
79649588
PH
59 'title': 'The Pony Mash',
60 'uploader': 'M_Pallante',
c19f7764 61 },
79649588 62 'skip': 'There is a limit of 200 free downloads / month for the test song',
c19f7764 63 },
eeb165e6 64 # embedded brightcove video
dd5bcdc4
JMF
65 # it also tests brightcove videos that need to set the 'Referer' in the
66 # http requests
eeb165e6 67 {
79649588
PH
68 'add_ie': ['Brightcove'],
69 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
70 'info_dict': {
71 'id': '2765128793001',
72 'ext': 'mp4',
73 'title': 'Le cours de bourse : l’analyse technique',
74 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
75 'uploader': 'BFM BUSINESS',
eeb165e6 76 },
79649588
PH
77 'params': {
78 'skip_download': True,
eeb165e6
JMF
79 },
80 },
17ab4d3b
PH
81 {
82 # https://github.com/rg3/youtube-dl/issues/2253
83 'url': 'http://bcove.me/i6nfkrc3',
17ab4d3b
PH
84 'md5': '0ba9446db037002366bab3b3eb30c88c',
85 'info_dict': {
fd50bf62
S
86 'id': '3101154703001',
87 'ext': 'mp4',
17ab4d3b
PH
88 'title': 'Still no power',
89 'uploader': 'thestar.com',
90 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
91 },
92 'add_ie': ['Brightcove'],
93 },
0479c625
S
94 {
95 'url': 'http://www.championat.com/video/football/v/87/87499.html',
96 'md5': 'fb973ecf6e4a78a67453647444222983',
97 'info_dict': {
98 'id': '3414141473001',
99 'ext': 'mp4',
100 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
101 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
102 'uploader': 'Championat',
103 },
104 },
bdf97017 105 {
37aab278 106 # https://github.com/rg3/youtube-dl/issues/3541
bdf97017
NJ
107 'add_ie': ['Brightcove'],
108 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
109 'info_dict': {
110 'id': '3866516442001',
37aab278 111 'ext': 'mp4',
bdf97017
NJ
112 'title': 'Leer mij vrouwen kennen: Aflevering 1',
113 'description': 'Leer mij vrouwen kennen: Aflevering 1',
114 'uploader': 'SBS Broadcasting',
115 },
37aab278 116 'skip': 'Restricted to Netherlands',
bdf97017 117 'params': {
37aab278 118 'skip_download': True, # m3u8 download
bdf97017
NJ
119 },
120 },
42393ce2
PH
121 # Direct link to a video
122 {
79649588 123 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
79649588
PH
124 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
125 'info_dict': {
126 'id': 'trailer',
89ef304b 127 'ext': 'mp4',
79649588
PH
128 'title': 'trailer',
129 'upload_date': '20100513',
42393ce2 130 }
c0d0b01f
JMF
131 },
132 # ooyala video
133 {
79649588 134 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
87830900 135 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
79649588
PH
136 'info_dict': {
137 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
138 'ext': 'mp4',
3486df38 139 'title': '2cc213299525360.mov', # that's what we get
c0d0b01f 140 },
87830900 141 'add_ie': ['Ooyala'],
c0d0b01f 142 },
f076b638 143 # multiple ooyala embeds on SBN network websites
144 {
145 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
146 'info_dict': {
147 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
148 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
149 },
150 'playlist_mincount': 3,
151 'params': {
152 'skip_download': True,
153 },
154 'add_ie': ['Ooyala'],
155 },
89ef304b
PH
156 # google redirect
157 {
158 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
159 'info_dict': {
160 'id': 'cmQHVoWB5FY',
161 'ext': 'mp4',
162 'upload_date': '20130224',
163 'uploader_id': 'TheVerge',
87830900 164 'description': 're:^Chris Ziegler takes a look at the\.*',
89ef304b
PH
165 'uploader': 'The Verge',
166 'title': 'First Firefox OS phones side-by-side',
167 },
168 'params': {
169 'skip_download': False,
170 }
f55a1f0a 171 },
1b86cc41 172 # embed.ly video
173 {
174 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
175 'info_dict': {
176 'id': '9ODmcdjQcHQ',
177 'ext': 'mp4',
0a5bce56
PH
178 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
179 'upload_date': '20140225',
180 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
181 'uploader': 'Tested',
182 'uploader_id': 'testedcom',
1b86cc41 183 },
184 # No need to test YoutubeIE here
185 'params': {
186 'skip_download': True,
187 },
188 },
60cc4dc4
PH
189 # funnyordie embed
190 {
191 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
60cc4dc4
PH
192 'info_dict': {
193 'id': '18e820ec3f',
194 'ext': 'mp4',
195 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
196 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
93d020dd 197 },
60cc4dc4 198 },
faa4ea68
S
199 # BBC iPlayer embeds
200 {
201 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
202 'info_dict': {
203 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
204 },
205 'playlist_mincount': 18,
206 },
93d020dd
S
207 # RUTV embed
208 {
209 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
210 'info_dict': {
211 'id': '776940',
212 'ext': 'mp4',
213 'title': 'Охотское море стало целиком российским',
214 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
215 },
216 'params': {
217 # m3u8 download
218 'skip_download': True,
219 },
aab74fa1
PH
220 },
221 # Embedded TED video
222 {
223 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
a8eb5a8e 224 'md5': '65fdff94098e4a607385a60c5177c638',
aab74fa1 225 'info_dict': {
a8eb5a8e 226 'id': '1969',
aab74fa1 227 'ext': 'mp4',
a8eb5a8e
PH
228 'title': 'Hidden miracles of the natural world',
229 'uploader': 'Louie Schwartzberg',
230 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
aab74fa1 231 }
60cc4dc4 232 },
5c386252 233 # Embedded Ustream video
234 {
235 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
236 'md5': '27b99cdb639c9b12a79bca876a073417',
237 'info_dict': {
ca6aada4 238 'id': '45734260',
239 'ext': 'flv',
240 'uploader': 'AU SPA: The NSA and Privacy',
5c386252 241 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
242 }
243 },
d95e35d6
S
244 # nowvideo embed hidden behind percent encoding
245 {
246 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
247 'md5': '2baf4ddd70f697d94b1c18cf796d5107',
248 'info_dict': {
249 'id': '06e53103ca9aa',
250 'ext': 'flv',
251 'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
252 'description': 'No description',
253 },
0f2a2ba1 254 },
893f8832
PH
255 # arte embed
256 {
257 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
258 'md5': '7653032cbb25bf6c80d80f217055fa43',
259 'info_dict': {
260 'id': '048195-004_PLUS7-F',
261 'ext': 'flv',
262 'title': 'X:enius',
263 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
264 'upload_date': '20140320',
265 },
266 'params': {
267 'skip_download': 'Requires rtmpdump'
268 }
269 },
fa35cdad
PH
270 # Condé Nast embed
271 {
272 'url': 'http://www.wired.com/2014/04/honda-asimo/',
273 'md5': 'ba0dfe966fa007657bd1443ee672db0f',
274 'info_dict': {
275 'id': '53501be369702d3275860000',
276 'ext': 'mp4',
277 'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
278 }
ebd3c7b3
PH
279 },
280 # Dailymotion embed
281 {
282 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
283 'md5': '441aeeb82eb72c422c7f14ec533999cd',
284 'info_dict': {
285 'id': 'k2mm4bCdJ6CQ2i7c8o2',
286 'ext': 'mp4',
287 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
288 'uploader': 'Spi0n',
289 },
290 'add_ie': ['Dailymotion'],
2b88feed
PH
291 },
292 # YouTube embed
293 {
294 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
295 'info_dict': {
296 'id': 'FXRb4ykk4S0',
297 'ext': 'mp4',
298 'title': 'The NBL Auction 2014',
299 'uploader': 'BADMINTON England',
300 'uploader_id': 'BADMINTONEvents',
301 'upload_date': '20140603',
302 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
303 },
304 'add_ie': ['Youtube'],
305 'params': {
306 'skip_download': True,
307 }
308 },
c5cd249e
JMF
309 # MTVServices embed
310 {
311 'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
312 'md5': '35727f82f58c76d996fc188f9755b0d5',
313 'info_dict': {
314 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
315 'ext': 'mp4',
316 'title': 'Review',
317 'description': 'Mario\'s life in the fast lane has never looked so good.',
318 },
319 },
61013473 320 # YouTube embed via <data-embed-url="">
321 {
322 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
61013473 323 'info_dict': {
a8eb5a8e 324 'id': '4vAffPZIT44',
61013473 325 'ext': 'mp4',
a8eb5a8e 326 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
ed2d6a19
PH
327 'uploader': 'Gameloft',
328 'uploader_id': 'gameloft',
a8eb5a8e
PH
329 'upload_date': '20140828',
330 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
ed2d6a19
PH
331 },
332 'params': {
333 'skip_download': True,
61013473 334 }
c8e9a235
PH
335 },
336 # Camtasia studio
337 {
338 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
339 'playlist': [{
340 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
341 'info_dict': {
342 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
343 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
344 'ext': 'flv',
345 'duration': 2235.90,
346 }
347 }, {
348 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
349 'info_dict': {
350 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
351 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
352 'ext': 'flv',
353 'duration': 2235.93,
354 }
355 }],
356 'info_dict': {
357 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
358 }
4d805e06
PH
359 },
360 # Flowplayer
361 {
362 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
363 'md5': '9d65602bf31c6e20014319c7d07fba27',
364 'info_dict': {
365 'id': '5123ea6d5e5a7',
366 'ext': 'mp4',
367 'age_limit': 18,
368 'uploader': 'www.handjobhub.com',
d6d9186f 369 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
4d805e06 370 }
0990305d
PH
371 },
372 # RSS feed
373 {
374 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
375 'info_dict': {
376 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
377 'title': 'Zero Punctuation',
b1b0b1ca 378 'description': 're:.*groundbreaking video review series.*'
0990305d
PH
379 },
380 'playlist_mincount': 11,
22a6f150
PH
381 },
382 # Multiple brightcove videos
383 # https://github.com/rg3/youtube-dl/issues/2283
384 {
385 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
386 'info_dict': {
387 'id': 'always-never',
388 'title': 'Always / Never - The New Yorker',
389 },
390 'playlist_count': 3,
391 'params': {
392 'extract_flat': False,
393 'skip_download': True,
394 }
1a94ff68
S
395 },
396 # MLB embed
397 {
398 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
399 'md5': '96f09a37e44da40dd083e12d9a683327',
400 'info_dict': {
401 'id': '33322633',
402 'ext': 'mp4',
403 'title': 'Ump changes call to ball',
404 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
405 'duration': 48,
406 'timestamp': 1401537900,
407 'upload_date': '20140531',
408 'thumbnail': 're:^https?://.*\.jpg$',
409 },
410 },
746c67d7
NJ
411 # Wistia embed
412 {
413 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
414 'md5': '8788b683c777a5cf25621eaf286d0c23',
415 'info_dict': {
416 'id': '1cfaf6b7ea',
417 'ext': 'mov',
418 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
419 'duration': 643.0,
420 'filesize': 182808282,
421 'uploader': 'education-portal.com',
422 },
423 },
52cffcb1 424 {
425 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
426 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
427 'info_dict': {
428 'id': 'uxjb0lwrcz',
429 'ext': 'mp4',
85d7b765 430 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
52cffcb1 431 'duration': 1715.0,
85d7b765 432 'uploader': 'thoughtworks.wistia.com',
70b7e3fb 433 },
52cffcb1 434 },
70b7e3fb
PH
435 # Direct download with broken HEAD
436 {
437 'url': 'http://ai-radio.org:8000/radio.opus',
438 'info_dict': {
439 'id': 'radio',
440 'ext': 'opus',
441 'title': 'radio',
442 },
443 'params': {
444 'skip_download': True, # infinite live stream
445 },
446 'expected_warnings': [
447 r'501.*Not Implemented'
448 ],
ac645ac7
PH
449 },
450 # Soundcloud embed
451 {
452 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
453 'info_dict': {
454 'id': '174391317',
455 'ext': 'mp3',
456 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
457 'uploader': 'Sophos Security',
458 'title': 'Chet Chat 171 - Oct 29, 2014',
459 'upload_date': '20141029',
460 }
af63fed7
PH
461 },
462 # Livestream embed
463 {
464 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
465 'info_dict': {
466 'id': '67864563',
467 'ext': 'flv',
468 'upload_date': '20141112',
469 'title': 'Rosetta #CometLanding webcast HL 10',
470 }
471 },
65f3a228
PH
472 # LazyYT
473 {
474 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
475 'info_dict': {
476 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
477 },
478 'playlist_mincount': 2,
4e262a88
PH
479 },
480 # Direct link with incorrect MIME type
481 {
482 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
483 'md5': '4ccbebe5f36706d85221f204d7eb5913',
484 'info_dict': {
485 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
486 'id': '5_Lennart_Poettering_-_Systemd',
487 'ext': 'webm',
488 'title': '5_Lennart_Poettering_-_Systemd',
489 'upload_date': '20141120',
490 },
491 'expected_warnings': [
492 'URL could be a direct video link, returning it as such.'
493 ]
42bdd9d0
PH
494 },
495 # Cinchcast embed
496 {
497 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
498 'info_dict': {
499 'id': '7141703',
500 'ext': 'mp3',
501 'upload_date': '20141126',
502 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
503 }
504 },
501f13fb
PH
505 # Cinerama player
506 {
507 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
508 'info_dict': {
509 'id': '730m_DandD_1901_512k',
510 'ext': 'mp4',
511 'uploader': 'www.abc.net.au',
512 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
513 }
796df3c6
S
514 },
515 # embedded viddler video
516 {
517 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
518 'info_dict': {
519 'id': '4d03aad9',
520 'ext': 'mp4',
521 'uploader': 'deadspin',
522 'title': 'WALL-TO-GORTAT',
523 'timestamp': 1422285291,
524 'upload_date': '20150126',
525 },
526 'add_ie': ['Viddler'],
a0f71985
PH
527 },
528 # jwplayer YouTube
529 {
530 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
531 'info_dict': {
532 'id': 'Mrj4DVp2zeA',
533 'ext': 'mp4',
534 'upload_date': '20150204',
535 'uploader': 'The National Archives UK',
536 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
537 'uploader_id': 'NationalArchives08',
538 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
539 },
59b8ab58
PH
540 },
541 # rtl.nl embed
542 {
543 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
544 'playlist_mincount': 5,
545 'info_dict': {
546 'id': 'aanslagen-kopenhagen',
547 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
548 }
501f13fb 549 }
cfe50f04 550 ]
9b122384 551
9b122384
PH
def report_following_redirect(self, new_url):
    """Print a '[redirect]' notice naming the URL that is being followed.

    new_url -- the redirect target, interpolated into the message with %s.
    """
    message = '[redirect] Following redirect to %s' % new_url
    self._downloader.to_screen(message)
9b122384 555
4fc946b5
PH
def _extract_rss(self, url, video_id, doc):
    """Turn a parsed RSS document into a playlist result.

    url      -- URL of the feed; it becomes the playlist 'id'.
    video_id -- not used by this method.
    doc      -- root element of the feed; entries are read from
                ./channel/item below it.

    Returns a dict with '_type': 'playlist' whose 'entries' are
    '_type': 'url' dicts (one per item that yields a URL).  The playlist
    'title' and 'description' are None when the channel lacks them.
    """
    def _text(parent, path):
        # Text of the first matching child, or None if there is no such child.
        node = parent.find(path)
        return None if node is None else node.text

    entries = []
    for item in doc.findall('./channel/item'):
        item_url = _text(item, 'link')
        if not item_url:
            # <link> is optional in RSS 2.0; an item may reference its
            # media only through <enclosure url="...">.
            for enclosure in item.findall('./enclosure'):
                item_url = enclosure.attrib.get('url')
                if item_url:
                    break
        if not item_url:
            # Nothing to hand to another extractor; skip instead of crashing.
            continue

        entries.append({
            '_type': 'url',
            'url': item_url,
            # <title> is optional as well, so this may be None.
            'title': _text(item, 'title'),
        })

    return {
        '_type': 'playlist',
        'id': url,
        'title': _text(doc, './channel/title'),
        'description': _text(doc, './channel/description'),
        'entries': entries,
    }
574
c8e9a235
PH
def _extract_camtasia(self, url, video_id, webpage):
    """Extract a Camtasia project referenced by webpage as a playlist.

    url      -- page URL; relative configuration and media URIs are
                resolved against it.
    video_id -- id passed through to the configuration download.
    webpage  -- page source that is searched for the 'csConfigFile'
                flash variable and the 'DC.title' meta tag.

    Returns a '_type': 'playlist' dict, or None if no camtasia video can
    be found (no configuration reference in the page, or no fileset in
    the downloaded configuration).
    """

    camtasia_cfg = self._search_regex(
        r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
        webpage, 'camtasia configuration file', default=None)
    if camtasia_cfg is None:
        return None

    title = self._html_search_meta('DC.title', webpage, fatal=True)

    camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
    camtasia_cfg = self._download_xml(
        camtasia_url, video_id,
        note='Downloading camtasia configuration',
        errnote='Failed to download camtasia configuration')
    fileset_node = camtasia_cfg.find('./playlist/array/fileset')
    if fileset_node is None:
        # Configuration without a fileset: nothing to extract.
        return None

    entries = []
    # Iterate the element directly: Element.getchildren() no longer exists
    # on Python 3.9+, while iteration yields the same children everywhere.
    for n in fileset_node:
        url_n = n.find('./uri')
        if url_n is None or not url_n.text:
            continue

        duration_n = n.find('./duration')
        entries.append({
            # File name of the URI without directory and extension
            'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
            # The child's tag name tells the parts of the project apart
            'title': '%s - %s' % (title, n.tag),
            'url': compat_urlparse.urljoin(url, url_n.text),
            'duration': (
                None if duration_n is None
                else float_or_none(duration_n.text)),
        })

    return {
        '_type': 'playlist',
        'entries': entries,
        'title': title,
    }
611
9b122384 612 def _real_extract(self, url):
ebd3c7b3
PH
613 if url.startswith('//'):
614 return {
615 '_type': 'url',
20991253 616 'url': self.http_scheme() + url,
ebd3c7b3
PH
617 }
618
a7130543
JMF
619 parsed_url = compat_urlparse.urlparse(url)
620 if not parsed_url.scheme:
04b4d394
PH
621 default_search = self._downloader.params.get('default_search')
622 if default_search is None:
1f7ccb90 623 default_search = 'fixup_error'
04b4d394 624
1f7ccb90 625 if default_search in ('auto', 'auto_warning', 'fixup_error'):
04b4d394
PH
626 if '/' in url:
627 self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
628 return self.url_result('http://' + url)
1f7ccb90 629 elif default_search != 'fixup_error':
9c1fc022 630 if default_search == 'auto_warning':
0e67ab0d
PH
631 if re.match(r'^(?:url|URL)$', url):
632 raise ExtractorError(
633 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
634 expected=True)
635 else:
636 self._downloader.report_warning(
7571c02c 637 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
04b4d394 638 return self.url_result('ytsearch:' + url)
1f7ccb90
PH
639
640 if default_search in ('error', 'fixup_error'):
7571c02c 641 raise ExtractorError(
b74e86f4
PH
642 '%r is not a valid URL. '
643 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
644 % (url, url), expected=True)
04b4d394 645 else:
f2f2c0c2
PH
646 if ':' not in default_search:
647 default_search += ':'
04b4d394 648 return self.url_result(default_search + url)
4d54ef20
PH
649
650 url, smuggled_data = unsmuggle_url(url)
651 force_videoid = None
d6e6a422 652 is_intentional = smuggled_data and smuggled_data.get('to_generic')
4d54ef20
PH
653 if smuggled_data and 'force_videoid' in smuggled_data:
654 force_videoid = smuggled_data['force_videoid']
655 video_id = force_videoid
656 else:
657 video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
a7130543 658
79649588 659 self.to_screen('%s: Requesting header' % video_id)
c1d1facd 660
ebab4520 661 head_req = HEADRequest(url)
23be51d8 662 head_response = self._request_webpage(
ebab4520
PH
663 head_req, video_id,
664 note=False, errnote='Could not send HEAD request to %s' % url,
665 fatal=False)
42393ce2 666
23be51d8 667 if head_response is not False:
42393ce2 668 # Check for redirect
23be51d8 669 new_url = head_response.geturl()
42393ce2
PH
670 if url != new_url:
671 self.report_following_redirect(new_url)
4d54ef20
PH
672 if force_videoid:
673 new_url = smuggle_url(
674 new_url, {'force_videoid': force_videoid})
cecaaf3f 675 return self.url_result(new_url)
42393ce2 676
23be51d8
PH
677 full_response = None
678 if head_response is False:
679 full_response = self._request_webpage(url, video_id)
680 head_response = full_response
681
682 # Check for direct link to a video
683 content_type = head_response.headers.get('Content-Type', '')
684 m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
685 if m:
686 upload_date = unified_strdate(
687 head_response.headers.get('Last-Modified'))
688 return {
689 'id': video_id,
690 'title': os.path.splitext(url_basename(url))[0],
ccdd0ffb 691 'direct': True,
23be51d8
PH
692 'formats': [{
693 'format_id': m.group('format_id'),
694 'url': url,
695 'vcodec': 'none' if m.group('type') == 'audio' else None
696 }],
697 'upload_date': upload_date,
698 }
42393ce2 699
d6e6a422
PH
700 if not self._downloader.params.get('test', False) and not is_intentional:
701 self._downloader.report_warning('Falling back on generic information extractor.')
702
4e262a88
PH
703 if not full_response:
704 full_response = self._request_webpage(url, video_id)
705
706 # Maybe it's a direct link to a video?
707 # Be careful not to download the whole thing!
708 first_bytes = full_response.read(512)
61ca9a80 709 if not is_html(first_bytes):
4e262a88
PH
710 self._downloader.report_warning(
711 'URL could be a direct video link, returning it as such.')
712 upload_date = unified_strdate(
713 head_response.headers.get('Last-Modified'))
714 return {
715 'id': video_id,
716 'title': os.path.splitext(url_basename(url))[0],
717 'direct': True,
718 'url': url,
719 'upload_date': upload_date,
720 }
721
722 webpage = self._webpage_read_content(
723 full_response, url, video_id, prefix=first_bytes)
724
9b122384 725 self.report_extraction(video_id)
887c6acd 726
4fc946b5
PH
727 # Is it an RSS feed?
728 try:
bcf89ce6 729 doc = parse_xml(webpage)
4fc946b5
PH
730 if doc.tag == 'rss':
731 return self._extract_rss(url, video_id, doc)
f7300c5c 732 except compat_xml_parse_error:
4fc946b5
PH
733 pass
734
c8e9a235
PH
735 # Is it a Camtasia project?
736 camtasia_res = self._extract_camtasia(url, video_id, webpage)
737 if camtasia_res is not None:
738 return camtasia_res
739
14390730
S
740 # Sometimes embedded video player is hidden behind percent encoding
741 # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
742 # Unescaping the whole page allows to handle those cases in a generic way
1f7659db
S
743 webpage = compat_urllib_parse.unquote(webpage)
744
887c6acd
PH
745 # it's tempting to parse this further, but you would
746 # have to take into account all the variations like
747 # Video Title - Site Name
748 # Site Name | Video Title
749 # Video Title - Tagline | Site Name
750 # and so on and so forth; it's just not practical
ef4fd848 751 video_title = self._html_search_regex(
79649588
PH
752 r'(?s)<title>(.*?)</title>', webpage, 'video title',
753 default='video')
ef4fd848 754
4d805e06
PH
755 # Try to detect age limit automatically
756 age_limit = self._rta_search(webpage)
757 # And then there are the jokers who advertise that they use RTA,
758 # but actually don't.
759 AGE_LIMIT_MARKERS = [
760 r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
761 ]
762 if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
763 age_limit = 18
764
ef4fd848
PH
765 # video uploader is domain name
766 video_uploader = self._search_regex(
79649588 767 r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
887c6acd 768
ed2d6a19 769 # Helper method
83992676 770 def _playlist_from_matches(matches, getter=None, ie=None):
3b2f933b 771 urlrs = orderedSet(
83992676 772 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
3b2f933b 773 for m in matches)
ed2d6a19
PH
774 return self.playlist_result(
775 urlrs, playlist_id=video_id, playlist_title=video_title)
776
627a91a9 777 # Look for BrightCove:
99877772
PH
778 bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
779 if bc_urls:
79649588 780 self.to_screen('Brightcove video detected.')
99877772
PH
781 entries = [{
782 '_type': 'url',
783 'url': smuggle_url(bc_url, {'Referer': url}),
784 'ie_key': 'Brightcove'
785 } for bc_url in bc_urls]
786
787 return {
788 '_type': 'playlist',
789 'title': video_title,
790 'id': video_id,
791 'entries': entries,
792 }
cfe50f04 793
59b8ab58
PH
794 # Look for embedded rtl.nl player
795 matches = re.findall(
796 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
797 webpage)
798 if matches:
799 return _playlist_from_matches(matches, ie='RtlNl')
800
7115ca84 801 # Look for embedded (iframe) Vimeo player
9d4660ca 802 mobj = re.search(
15fd51b3 803 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
9d4660ca 804 if mobj:
15fd51b3 805 player_url = unescapeHTML(mobj.group('url'))
9d4660ca 806 surl = smuggle_url(player_url, {'Referer': url})
09a42738 807 return self.url_result(surl)
7115ca84
PH
808 # Look for embedded (swf embed) Vimeo player
809 mobj = re.search(
09a42738 810 r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
7115ca84 811 if mobj:
09a42738 812 return self.url_result(mobj.group(1))
7115ca84 813
53c1d3ef 814 # Look for embedded YouTube player
1f9da904 815 matches = re.findall(r'''(?x)
2b88feed
PH
816 (?:
817 <iframe[^>]+?src=|
c71dfccc 818 data-video-url=|
2b88feed 819 <embed[^>]+?src=|
a7e97f6d
PH
820 embedSWF\(?:\s*|
821 new\s+SWFObject\(
2b88feed
PH
822 )
823 (["\'])
1bf5423e 824 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
6b08cdf6 825 (?:embed|v|p)/.+?)
1f9da904 826 \1''', webpage)
887c6acd 827 if matches:
ed2d6a19 828 return _playlist_from_matches(
3b2f933b 829 matches, lambda m: unescapeHTML(m[1]))
53c1d3ef 830
65f3a228
PH
831 # Look for lazyYT YouTube embed
832 matches = re.findall(
833 r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
834 if matches:
835 return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
836
355e4fd0
PH
837 # Look for embedded Dailymotion player
838 matches = re.findall(
ef4fd848 839 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
355e4fd0 840 if matches:
ed2d6a19
PH
841 return _playlist_from_matches(
842 matches, lambda m: unescapeHTML(m[1]))
355e4fd0 843
8489578d
NJ
844 # Look for embedded Dailymotion playlist player (#3822)
845 m = re.search(
846 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
847 if m:
848 playlists = re.findall(
849 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
850 if playlists:
851 return _playlist_from_matches(
852 playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
853
ef4fd848
PH
854 # Look for embedded Wistia player
855 match = re.search(
281d3f1d 856 r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
ef4fd848 857 if match:
9471c444
NJ
858 embed_url = self._proto_relative_url(
859 unescapeHTML(match.group('url')))
ef4fd848
PH
860 return {
861 '_type': 'url_transparent',
9471c444 862 'url': embed_url,
ef4fd848
PH
863 'ie_key': 'Wistia',
864 'uploader': video_uploader,
865 'title': video_title,
866 'id': video_id,
867 }
5f6a1245 868
9471c444 869 match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
746c67d7
NJ
870 if match:
871 return {
872 '_type': 'url_transparent',
873 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
874 'ie_key': 'Wistia',
875 'uploader': video_uploader,
876 'title': video_title,
877 'id': match.group('id')
878 }
ef4fd848 879
ee3e63e4 880 # Look for embedded blip.tv player
19dab5e6 881 mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
ee3e63e4 882 if mobj:
2514d263 883 return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
1f8b6af7 884 mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
ee3e63e4 885 if mobj:
19dab5e6 886 return self.url_result(mobj.group(1), 'BlipTV')
ee3e63e4 887
fa35cdad
PH
888 # Look for embedded condenast player
889 matches = re.findall(
890 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
891 webpage)
892 if matches:
893 return {
894 '_type': 'playlist',
895 'entries': [{
896 '_type': 'url',
897 'ie_key': 'CondeNast',
898 'url': ma,
899 } for ma in matches],
900 'title': video_title,
901 'id': video_id,
902 }
903
c19f7764
JMF
904 # Look for Bandcamp pages with custom domain
905 mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
906 if mobj is not None:
907 burl = unescapeHTML(mobj.group(1))
09804265
JMF
908 # Don't set the extractor because it can be a track url or an album
909 return self.url_result(burl)
c19f7764 910
f25571ff
PH
911 # Look for embedded Vevo player
912 mobj = re.search(
913 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
914 if mobj is not None:
915 return self.url_result(mobj.group('url'))
796df3c6
S
916
917 # Look for embedded Viddler player
cb454b33
S
918 mobj = re.search(
919 r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
920 webpage)
796df3c6
S
921 if mobj is not None:
922 return self.url_result(mobj.group('url'))
f25571ff 923
c0d0b01f 924 # Look for Ooyala videos
cb454b33 925 mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
f076b638 926 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
927 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage))
c0d0b01f 928 if mobj is not None:
750f9020 929 return OoyalaIE._build_url_result(mobj.group('ec'))
c0d0b01f 930
f076b638 931 # Look for multiple Ooyala embeds on SBN network websites
932 mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
933 if mobj is not None:
934 embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
935 if embeds:
936 return _playlist_from_matches(
937 embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
938
aa94a6d3 939 # Look for Aparat videos
48099643 940 mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
aa94a6d3
PH
941 if mobj is not None:
942 return self.url_result(mobj.group(1), 'Aparat')
943
c93c2ab1 944 # Look for MPORA videos
c3f51436 945 mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
c93c2ab1
PH
946 if mobj is not None:
947 return self.url_result(mobj.group(1), 'Mpora')
5f59ee79 948
15c0e8e7 949 # Look for embedded NovaMov-based player
8f89e687 950 mobj = re.search(
8dfa187b 951 r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
15c0e8e7
S
952 (?P<url>http://(?:(?:embed|www)\.)?
953 (?:novamov\.com|
954 nowvideo\.(?:ch|sx|eu|at|ag|co)|
955 videoweed\.(?:es|com)|
956 movshare\.(?:net|sx|ag)|
957 divxstage\.(?:eu|net|ch|co|at|ag))
958 /embed\.php.+?)\1''', webpage)
8f89e687 959 if mobj is not None:
15c0e8e7 960 return self.url_result(mobj.group('url'))
50f56607 961
9834872b
PH
962 # Look for embedded Facebook player
963 mobj = re.search(
db1f3888 964 r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
9834872b
PH
965 if mobj is not None:
966 return self.url_result(mobj.group('url'), 'Facebook')
967
ca97a56e
S
968 # Look for embedded VK player
969 mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
970 if mobj is not None:
971 return self.url_result(mobj.group('url'), 'VK')
972
0364fa8b
S
973 # Look for embedded ivi player
974 mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
975 if mobj is not None:
976 return self.url_result(mobj.group('url'), 'Ivi')
977
db1f3888
PH
978 # Look for embedded Huffington Post player
979 mobj = re.search(
c3f51436 980 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
db1f3888
PH
981 if mobj is not None:
982 return self.url_result(mobj.group('url'), 'HuffPost')
983
1b86cc41 984 # Look for embed.ly
985 mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
986 if mobj is not None:
987 return self.url_result(mobj.group('url'))
988 mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
989 if mobj is not None:
990 return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
991
60cc4dc4
PH
992 # Look for funnyordie embed
993 matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
994 if matches:
ed2d6a19
PH
995 return _playlist_from_matches(
996 matches, getter=unescapeHTML, ie='FunnyOrDie')
60cc4dc4 997
db546cf8
S
998 # Look for BBC iPlayer embed
999 matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1000 if matches:
476eae0c 1001 return _playlist_from_matches(matches, ie='BBCCoUk')
db546cf8 1002
93d020dd
S
1003 # Look for embedded RUTV player
1004 rutv_url = RUTVIE._extract_url(webpage)
1005 if rutv_url:
1006 return self.url_result(rutv_url, 'RUTV')
1007
7e2ede98
JMF
1008 # Look for embedded TED player
1009 mobj = re.search(
d7cc31b6 1010 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
7e2ede98
JMF
1011 if mobj is not None:
1012 return self.url_result(mobj.group('url'), 'TED')
1013
5c386252 1014 # Look for embedded Ustream videos
1015 mobj = re.search(
1016 r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1017 if mobj is not None:
1018 return self.url_result(mobj.group('url'), 'Ustream')
1019
893f8832
PH
1020 # Look for embedded arte.tv player
1021 mobj = re.search(
1022 r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1023 webpage)
1024 if mobj is not None:
1025 return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1026
cb3ac1c6
S
1027 # Look for embedded smotri.com player
1028 smotri_url = SmotriIE._extract_url(webpage)
1029 if smotri_url:
1030 return self.url_result(smotri_url, 'Smotri')
1031
20991253
PH
1032 # Look for embedded soundcloud player
1033 mobj = re.search(
ac645ac7 1034 r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
20991253
PH
1035 webpage)
1036 if mobj is not None:
1037 url = unescapeHTML(mobj.group('url'))
1038 return self.url_result(url)
1039
826ec77f
PH
1040 # Look for embedded vulture.com player
1041 mobj = re.search(
1042 r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1043 webpage)
1044 if mobj is not None:
1045 url = unescapeHTML(mobj.group('url'))
1046 return self.url_result(url, ie='Vulture')
1047
c5cd249e
JMF
1048 # Look for embedded mtvservices player
1049 mobj = re.search(
1050 r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1051 webpage)
1052 if mobj is not None:
1053 url = unescapeHTML(mobj.group('url'))
1054 return self.url_result(url, ie='MTVServicesEmbedded')
1055
49807b4a
S
1056 # Look for embedded yahoo player
1057 mobj = re.search(
1058 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1059 webpage)
1060 if mobj is not None:
1061 return self.url_result(mobj.group('url'), 'Yahoo')
1062
2ef6fcb5
PH
1063 # Look for embedded sbs.com.au player
1064 mobj = re.search(
e98b8e79
PH
1065 r'''(?x)
1066 (?:
1067 <meta\s+property="og:video"\s+content=|
1068 <iframe[^>]+?src=
1069 )
1070 (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
2ef6fcb5
PH
1071 webpage)
1072 if mobj is not None:
1073 return self.url_result(mobj.group('url'), 'SBS')
1074
42bdd9d0
PH
1075 # Look for embedded Cinchcast player
1076 mobj = re.search(
1077 r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1078 webpage)
1079 if mobj is not None:
1080 return self.url_result(mobj.group('url'), 'Cinchcast')
1081
1a94ff68 1082 mobj = re.search(
5263cdfc 1083 r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1a94ff68
S
1084 webpage)
1085 if mobj is not None:
1086 return self.url_result(mobj.group('url'), 'MLB')
1087
1419fafd
S
1088 mobj = re.search(
1089 r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1090 webpage)
1091 if mobj is not None:
1092 return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1093
af63fed7
PH
1094 mobj = re.search(
1095 r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1096 webpage)
1097 if mobj is not None:
1098 return self.url_result(mobj.group('url'), 'Livestream')
1099
def check_video(vurl):
    """Return True when *vurl* should be kept as a candidate video URL.

    A URL that YoutubeIE considers suitable is accepted immediately.
    Any other URL is kept only if its path contains a dot and the
    extension derived from that path is not one of the excluded
    extensions listed below.
    """
    if not YoutubeIE.suitable(vurl):
        # Only the path component is inspected; the query string is ignored.
        url_path = compat_urlparse.urlparse(vurl).path
        extension = determine_ext(url_path)
        if '.' not in url_path:
            return False
        excluded_exts = ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
        return extension not in excluded_exts
    return True
1106
def filter_video(urls):
    """Return a list of the entries of *urls* that pass check_video, in order."""
    return [candidate for candidate in urls if check_video(candidate)]
1109
9b122384 1110 # Start with something easy: JW Player in SWFObject
ced659bb 1111 found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
b30b8698 1112 if not found:
d981cef6 1113 # Look for gorilla-vid style embedding
ced659bb 1114 found = filter_video(re.findall(r'''(?sx)
c0292e8a
PH
1115 (?:
1116 jw_plugins|
1117 JWPlayerOptions|
1118 jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1119 )
a0f71985
PH
1120 .*?
1121 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
b30b8698 1122 if not found:
9b122384 1123 # Broaden the search a little bit
ced659bb 1124 found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
b30b8698
PH
1125 if not found:
1126 # Broaden the findall a little bit: JWPlayer JS loader
ced659bb
S
1127 found = filter_video(re.findall(
1128 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
4d805e06
PH
1129 if not found:
1130 # Flow player
ced659bb 1131 found = filter_video(re.findall(r'''(?xs)
4d805e06
PH
1132 flowplayer\("[^"]+",\s*
1133 \{[^}]+?\}\s*,
52585fd6 1134 \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
4d805e06 1135 ["']?url["']?\s*:\s*["']([^"']+)["']
ced659bb 1136 ''', webpage))
501f13fb
PH
1137 if not found:
1138 # Cinerama player
1139 found = re.findall(
1140 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
b30b8698 1141 if not found:
9b122384 1142 # Try to find twitter cards info
ced659bb
S
1143 found = filter_video(re.findall(
1144 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
b30b8698 1145 if not found:
9b122384
PH
1146 # We look for Open Graph info:
1147 # We have to match any number of spaces between elements, some sites try to align them (e.g. statigr.am)
b30b8698 1148 m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
9b122384
PH
1149 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1150 if m_video_type is not None:
ced659bb 1151 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
b30b8698 1152 if not found:
7fea7156 1153 # HTML5 video
9b32eca3 1154 found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
b30b8698 1155 if not found:
a5a45015 1156 found = re.search(
89ef304b 1157 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
a04aa7a9 1158 r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'?([^\'"]+)',
89ef304b 1159 webpage)
b30b8698
PH
1160 if found:
1161 new_url = found.group(1)
89ef304b
PH
1162 self.report_following_redirect(new_url)
1163 return {
1164 '_type': 'url',
1165 'url': new_url,
1166 }
b30b8698 1167 if not found:
416c7fcb 1168 raise UnsupportedError(url)
9b122384 1169
b30b8698
PH
1170 entries = []
1171 for video_url in found:
1172 video_url = compat_urlparse.urljoin(url, video_url)
1173 video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
9b122384 1174
b30b8698
PH
1175 # Sometimes, jwplayer extraction will result in a YouTube URL
1176 if YoutubeIE.suitable(video_url):
1177 entries.append(self.url_result(video_url, 'Youtube'))
1178 continue
9b122384 1179
b30b8698
PH
1180 # here's a fun little line of code for you:
1181 video_id = os.path.splitext(video_id)[0]
fc9713a1 1182
b30b8698
PH
1183 entries.append({
1184 'id': video_id,
1185 'url': video_url,
1186 'uploader': video_uploader,
1187 'title': video_title,
4d805e06 1188 'age_limit': age_limit,
b30b8698
PH
1189 })
1190
1191 if len(entries) == 1:
669f0e7c 1192 return entries[0]
b30b8698
PH
1193 else:
1194 for num, e in enumerate(entries, start=1):
1195 e['title'] = '%s (%d)' % (e['title'], num)
1196 return {
1197 '_type': 'playlist',
1198 'entries': entries,
1199 }