8 import xml
.etree
.ElementTree
10 from .common
import InfoExtractor
, SearchInfoExtractor
11 from .subtitles
import SubtitlesInfoExtractor
17 compat_urllib_request
,
28 class YoutubeBaseInfoExtractor(InfoExtractor
):
29 """Provide base functions for Youtube extractors"""
30 _LOGIN_URL
= 'https://accounts.google.com/ServiceLogin'
31 _LANG_URL
= r
'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
32 _AGE_URL
= 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
33 _NETRC_MACHINE
= 'youtube'
34 # If True it will raise an error if no login info is provided
35 _LOGIN_REQUIRED
= False
def report_lang(self):
    """Notify the user that the interface language is being set."""
    self.to_screen(u'Setting language')
41 def _set_language(self
):
42 request
= compat_urllib_request
.Request(self
._LANG
_URL
)
45 compat_urllib_request
.urlopen(request
).read()
46 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
47 self
._downloader
.report_warning(u
'unable to set language: %s' % compat_str(err
))
52 (username
, password
) = self
._get
_login
_info
()
53 # No authentication to be performed
55 if self
._LOGIN
_REQUIRED
:
56 raise ExtractorError(u
'No login info available, needed for using %s.' % self
.IE_NAME
, expected
=True)
59 request
= compat_urllib_request
.Request(self
._LOGIN
_URL
)
61 login_page
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
62 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
63 self
._downloader
.report_warning(u
'unable to fetch login page: %s' % compat_str(err
))
68 match
= re
.search(re
.compile(r
'<input.+?name="GALX".+?value="(.+?)"', re
.DOTALL
), login_page
)
71 match
= re
.search(re
.compile(r
'<input.+?name="dsh".+?value="(.+?)"', re
.DOTALL
), login_page
)
77 u
'continue': u
'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
81 u
'PersistentCookie': u
'yes',
83 u
'bgresponse': u
'js_disabled',
84 u
'checkConnection': u
'',
85 u
'checkedDomains': u
'youtube',
91 u
'signIn': u
'Sign in',
93 u
'service': u
'youtube',
97 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
99 login_form
= dict((k
.encode('utf-8'), v
.encode('utf-8')) for k
,v
in login_form_strs
.items())
100 login_data
= compat_urllib_parse
.urlencode(login_form
).encode('ascii')
101 request
= compat_urllib_request
.Request(self
._LOGIN
_URL
, login_data
)
104 login_results
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
105 if re
.search(r
'(?i)<form[^>]* id="gaia_loginform"', login_results
) is not None:
106 self
._downloader
.report_warning(u
'unable to log in: bad username or password')
108 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
109 self
._downloader
.report_warning(u
'unable to log in: %s' % compat_str(err
))
113 def _confirm_age(self
):
116 'action_confirm': 'Confirm',
118 request
= compat_urllib_request
.Request(self
._AGE
_URL
, compat_urllib_parse
.urlencode(age_form
))
120 self
.report_age_confirmation()
121 compat_urllib_request
.urlopen(request
).read().decode('utf-8')
122 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
123 raise ExtractorError(u
'Unable to confirm age: %s' % compat_str(err
))
126 def _real_initialize(self
):
127 if self
._downloader
is None:
129 if not self
._set
_language
():
131 if not self
._login
():
136 class YoutubeIE(YoutubeBaseInfoExtractor
, SubtitlesInfoExtractor
):
137 IE_DESC
= u
'YouTube.com'
140 (?:https?://)? # http(s):// (optional)
141 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
142 tube\.majestyc\.net/|
143 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
144 (?:.*?\#/)? # handle anchor (#/) redirect urls
145 (?: # the various things that can precede the ID:
146 (?:(?:v|embed|e)/) # v/ or embed/ or e/
147 |(?: # or the v= param in all its forms
148 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
149 (?:\?|\#!?) # the params delimiter ? or # or #!
150 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
154 |youtu\.be/ # just youtu.be/xxxx
156 )? # all until now is optional -> you can pass the naked ID
157 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
158 (?(1).+)? # if we found the ID, everything can follow
160 _NEXT_URL_RE
= r
'[\?&]next_url=([^&]+)'
161 # Listed in order of quality
162 _available_formats
= ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
163 # Apple HTTP Live Streaming
164 '96', '95', '94', '93', '92', '132', '151',
166 '85', '84', '102', '83', '101', '82', '100',
168 '138', '137', '248', '136', '247', '135', '246',
169 '245', '244', '134', '243', '133', '242', '160',
171 '141', '172', '140', '171', '139',
173 _available_formats_prefer_free
= ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
174 # Apple HTTP Live Streaming
175 '96', '95', '94', '93', '92', '132', '151',
177 '85', '102', '84', '101', '83', '100', '82',
179 '138', '248', '137', '247', '136', '246', '245',
180 '244', '135', '243', '134', '242', '133', '160',
182 '172', '141', '171', '140', '139',
184 _video_formats_map
= {
185 'flv': ['35', '34', '6', '5'],
186 '3gp': ['36', '17', '13'],
187 'mp4': ['38', '37', '22', '18'],
188 'webm': ['46', '45', '44', '43'],
190 _video_extensions
= {
212 # Apple HTTP Live Streaming
244 _video_dimensions
= {
326 u
"url": u
"http://www.youtube.com/watch?v=BaW_jenozKc",
327 u
"file": u
"BaW_jenozKc.mp4",
329 u
"title": u
"youtube-dl test video \"'/\\ä↭𝕐",
330 u
"uploader": u
"Philipp Hagemeister",
331 u
"uploader_id": u
"phihag",
332 u
"upload_date": u
"20121002",
333 u
"description": u
"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
337 u
"url": u
"http://www.youtube.com/watch?v=1ltcDfZMA3U",
338 u
"file": u
"1ltcDfZMA3U.flv",
339 u
"note": u
"Test VEVO video (#897)",
341 u
"upload_date": u
"20070518",
342 u
"title": u
"Maps - It Will Find You",
343 u
"description": u
"Music video by Maps performing It Will Find You.",
344 u
"uploader": u
"MuteUSA",
345 u
"uploader_id": u
"MuteUSA"
349 u
"url": u
"http://www.youtube.com/watch?v=UxxajLWwzqY",
350 u
"file": u
"UxxajLWwzqY.mp4",
351 u
"note": u
"Test generic use_cipher_signature video (#897)",
353 u
"upload_date": u
"20120506",
354 u
"title": u
"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
355 u
"description": u
"md5:3e2666e0a55044490499ea45fe9037b7",
356 u
"uploader": u
"Icona Pop",
357 u
"uploader_id": u
"IconaPop"
361 u
"url": u
"https://www.youtube.com/watch?v=07FYdnEawAQ",
362 u
"file": u
"07FYdnEawAQ.mp4",
363 u
"note": u
"Test VEVO video with age protection (#956)",
365 u
"upload_date": u
"20130703",
366 u
"title": u
"Justin Timberlake - Tunnel Vision (Explicit)",
367 u
"description": u
"md5:64249768eec3bc4276236606ea996373",
368 u
"uploader": u
"justintimberlakeVEVO",
369 u
"uploader_id": u
"justintimberlakeVEVO"
373 u
'url': u
'https://www.youtube.com/watch?v=TGi3HqYrWHE',
374 u
'file': u
'TGi3HqYrWHE.mp4',
375 u
'note': u
'm3u8 video',
377 u
'title': u
'Triathlon - Men - London 2012 Olympic Games',
378 u
'description': u
'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
379 u
'uploader': u
'olympic',
380 u
'upload_date': u
'20120807',
381 u
'uploader_id': u
'olympic',
384 u
'skip_download': True,
def suitable(cls, url):
    """Return True when *url* should be handled by this extractor.

    Playlist URLs are explicitly delegated to YoutubePlaylistIE, which
    would otherwise also be matched by this extractor's pattern.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    return match is not None
def report_video_webpage_download(self, video_id):
    """Announce that the video webpage download is starting."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Announce that the video info webpage download is starting."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_information_extraction(self, video_id):
    """Announce that video information extraction is starting."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Announce that the requested format is not available for this video."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
def report_rtmp_download(self):
    """Announce that the download will go over the RTMP protocol."""
    self.to_screen(u'RTMP download detected')
416 def _decrypt_signature(self
, s
):
417 """Turn the encrypted s field into a working signature"""
420 return s
[86:29:-1] + s
[88] + s
[28:5:-1]
422 return s
[25] + s
[3:25] + s
[0] + s
[26:42] + s
[79] + s
[43:79] + s
[91] + s
[80:83]
424 return s
[84:27:-1] + s
[86] + s
[26:5:-1]
426 return s
[25] + s
[3:25] + s
[2] + s
[26:40] + s
[77] + s
[41:77] + s
[89] + s
[78:81]
428 return s
[84:78:-1] + s
[87] + s
[77:60:-1] + s
[0] + s
[59:3:-1]
430 return s
[7:28] + s
[87] + s
[29:45] + s
[55] + s
[46:55] + s
[2] + s
[56:87] + s
[28]
432 return s
[6:27] + s
[4] + s
[28:39] + s
[27] + s
[40:59] + s
[2] + s
[60:]
434 return s
[5:34] + s
[0] + s
[35:38] + s
[3] + s
[39:45] + s
[38] + s
[46:53] + s
[73] + s
[54:73] + s
[85] + s
[74:85] + s
[53]
436 return s
[3:11] + s
[0] + s
[12:55] + s
[84] + s
[56:84]
438 return s
[81:36:-1] + s
[0] + s
[35:2:-1]
440 return s
[81:64:-1] + s
[82] + s
[63:52:-1] + s
[45] + s
[51:45:-1] + s
[1] + s
[44:1:-1] + s
[0]
442 return s
[80:73:-1] + s
[81] + s
[72:54:-1] + s
[2] + s
[53:43:-1] + s
[0] + s
[42:2:-1] + s
[43] + s
[1] + s
[54]
444 return s
[56] + s
[79:56:-1] + s
[41] + s
[55:41:-1] + s
[80] + s
[40:34:-1] + s
[0] + s
[33:29:-1] + s
[34] + s
[28:9:-1] + s
[29] + s
[8:0:-1] + s
[9]
446 return s
[1:19] + s
[0] + s
[20:68] + s
[19] + s
[69:80]
448 return s
[54] + s
[77:54:-1] + s
[39] + s
[53:39:-1] + s
[78] + s
[38:34:-1] + s
[0] + s
[33:29:-1] + s
[34] + s
[28:9:-1] + s
[29] + s
[8:0:-1] + s
[9]
451 raise ExtractorError(u
'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s
)))
453 def _decrypt_signature_age_gate(self
, s
):
454 # The videos with age protection use another player, so the algorithms
457 return s
[2:63] + s
[82] + s
[64:82] + s
[63]
459 # Fallback to the other algortihms
460 return self
._decrypt
_signature
(s
)
462 def _get_available_subtitles(self
, video_id
):
464 sub_list
= self
._download
_webpage
(
465 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
,
466 video_id
, note
=False)
467 except ExtractorError
as err
:
468 self
._downloader
.report_warning(u
'unable to download video subtitles: %s' % compat_str(err
))
470 lang_list
= re
.findall(r
'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list
)
475 params
= compat_urllib_parse
.urlencode({
478 'fmt': self
._downloader
.params
.get('subtitlesformat'),
480 url
= u
'http://www.youtube.com/api/timedtext?' + params
481 sub_lang_list
[lang
] = url
482 if not sub_lang_list
:
483 self
._downloader
.report_warning(u
'video doesn\'t have subtitles')
487 def _get_available_automatic_caption(self
, video_id
, webpage
):
488 """We need the webpage for getting the captions url, pass it as an
489 argument to speed up the process."""
490 sub_format
= self
._downloader
.params
.get('subtitlesformat')
491 self
.to_screen(u
'%s: Looking for automatic captions' % video_id
)
492 mobj
= re
.search(r
';ytplayer.config = ({.*?});', webpage
)
493 err_msg
= u
'Couldn\'t find automatic captions for %s' % video_id
495 self
._downloader
.report_warning(err_msg
)
497 player_config
= json
.loads(mobj
.group(1))
499 args
= player_config
[u
'args']
500 caption_url
= args
[u
'ttsurl']
501 timestamp
= args
[u
'timestamp']
502 # We get the available subtitles
503 list_params
= compat_urllib_parse
.urlencode({
508 list_url
= caption_url
+ '&' + list_params
509 list_page
= self
._download
_webpage
(list_url
, video_id
)
510 caption_list
= xml
.etree
.ElementTree
.fromstring(list_page
.encode('utf-8'))
511 original_lang_node
= caption_list
.find('track')
512 if original_lang_node
.attrib
.get('kind') != 'asr' :
513 self
._downloader
.report_warning(u
'Video doesn\'t have automatic captions')
515 original_lang
= original_lang_node
.attrib
['lang_code']
518 for lang_node
in caption_list
.findall('target'):
519 sub_lang
= lang_node
.attrib
['lang_code']
520 params
= compat_urllib_parse
.urlencode({
521 'lang': original_lang
,
527 sub_lang_list
[sub_lang
] = caption_url
+ '&' + params
529 # An extractor error can be raise by the download process if there are
530 # no automatic captions but there are subtitles
531 except (KeyError, ExtractorError
):
532 self
._downloader
.report_warning(err_msg
)
535 def _print_formats(self
, formats
):
536 print('Available formats:')
538 print('%s\t:\t%s\t[%s]%s' %(x
, self
._video
_extensions
.get(x
, 'flv'),
539 self
._video
_dimensions
.get(x
, '???'),
540 ' ('+self
._special
_itags
[x
]+')' if x
in self
._special
_itags
else ''))
542 def _extract_id(self
, url
):
543 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
545 raise ExtractorError(u
'Invalid URL: %s' % url
)
546 video_id
= mobj
.group(2)
549 def _get_video_url_list(self
, url_map
):
551 Transform a dictionary in the format {itag:url} to a list of (itag, url)
552 with the requested formats.
554 req_format
= self
._downloader
.params
.get('format', None)
555 format_limit
= self
._downloader
.params
.get('format_limit', None)
556 available_formats
= self
._available
_formats
_prefer
_free
if self
._downloader
.params
.get('prefer_free_formats', False) else self
._available
_formats
557 if format_limit
is not None and format_limit
in available_formats
:
558 format_list
= available_formats
[available_formats
.index(format_limit
):]
560 format_list
= available_formats
561 existing_formats
= [x
for x
in format_list
if x
in url_map
]
562 if len(existing_formats
) == 0:
563 raise ExtractorError(u
'no known formats available for video')
564 if self
._downloader
.params
.get('listformats', None):
565 self
._print
_formats
(existing_formats
)
567 if req_format
is None or req_format
== 'best':
568 video_url_list
= [(existing_formats
[0], url_map
[existing_formats
[0]])] # Best quality
569 elif req_format
== 'worst':
570 video_url_list
= [(existing_formats
[-1], url_map
[existing_formats
[-1]])] # worst quality
571 elif req_format
in ('-1', 'all'):
572 video_url_list
= [(f
, url_map
[f
]) for f
in existing_formats
] # All formats
574 # Specific formats. We pick the first in a slash-delimeted sequence.
575 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
576 # available in the specified format. For example,
577 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
578 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
579 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
580 req_formats
= req_format
.split('/')
581 video_url_list
= None
582 for rf
in req_formats
:
584 video_url_list
= [(rf
, url_map
[rf
])]
586 if rf
in self
._video
_formats
_map
:
587 for srf
in self
._video
_formats
_map
[rf
]:
589 video_url_list
= [(srf
, url_map
[srf
])]
594 if video_url_list
is None:
595 raise ExtractorError(u
'requested format not available')
596 return video_url_list
598 def _extract_from_m3u8(self
, manifest_url
, video_id
):
600 def _get_urls(_manifest
):
601 lines
= _manifest
.split('\n')
602 urls
= filter(lambda l
: l
and not l
.startswith('#'),
605 manifest
= self
._download
_webpage
(manifest_url
, video_id
, u
'Downloading formats manifest')
606 formats_urls
= _get_urls(manifest
)
607 for format_url
in formats_urls
:
608 itag
= self
._search
_regex
(r
'itag/(\d+?)/', format_url
, 'itag')
609 url_map
[itag
] = format_url
612 def _real_extract(self
, url
):
613 if re
.match(r
'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url
):
614 self
._downloader
.report_warning(u
'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
616 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
617 mobj
= re
.search(self
._NEXT
_URL
_RE
, url
)
619 url
= 'https://www.youtube.com/' + compat_urllib_parse
.unquote(mobj
.group(1)).lstrip('/')
620 video_id
= self
._extract
_id
(url
)
623 self
.report_video_webpage_download(video_id
)
624 url
= 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
625 request
= compat_urllib_request
.Request(url
)
627 video_webpage_bytes
= compat_urllib_request
.urlopen(request
).read()
628 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
629 raise ExtractorError(u
'Unable to download video webpage: %s' % compat_str(err
))
631 video_webpage
= video_webpage_bytes
.decode('utf-8', 'ignore')
633 # Attempt to extract SWF player URL
634 mobj
= re
.search(r
'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage
)
636 player_url
= re
.sub(r
'\\(.)', r
'\1', mobj
.group(1))
641 self
.report_video_info_webpage_download(video_id
)
642 if re
.search(r
'player-age-gate-content">', video_webpage
) is not None:
643 self
.report_age_confirmation()
645 # We simulate the access to the video from www.youtube.com/v/{video_id}
646 # this can be viewed without login into Youtube
647 data
= compat_urllib_parse
.urlencode({'video_id': video_id
,
651 'eurl': 'https://youtube.googleapis.com/v/' + video_id
,
655 video_info_url
= 'https://www.youtube.com/get_video_info?' + data
656 video_info_webpage
= self
._download
_webpage
(video_info_url
, video_id
,
658 errnote
='unable to download video info webpage')
659 video_info
= compat_parse_qs(video_info_webpage
)
662 for el_type
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
663 video_info_url
= ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
664 % (video_id
, el_type
))
665 video_info_webpage
= self
._download
_webpage
(video_info_url
, video_id
,
667 errnote
='unable to download video info webpage')
668 video_info
= compat_parse_qs(video_info_webpage
)
669 if 'token' in video_info
:
671 if 'token' not in video_info
:
672 if 'reason' in video_info
:
673 raise ExtractorError(u
'YouTube said: %s' % video_info
['reason'][0], expected
=True)
675 raise ExtractorError(u
'"token" parameter not in video info for unknown reason')
677 # Check for "rental" videos
678 if 'ypc_video_rental_bar_text' in video_info
and 'author' not in video_info
:
679 raise ExtractorError(u
'"rental" videos not supported')
681 # Start extracting information
682 self
.report_information_extraction(video_id
)
685 if 'author' not in video_info
:
686 raise ExtractorError(u
'Unable to extract uploader name')
687 video_uploader
= compat_urllib_parse
.unquote_plus(video_info
['author'][0])
690 video_uploader_id
= None
691 mobj
= re
.search(r
'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage
)
693 video_uploader_id
= mobj
.group(1)
695 self
._downloader
.report_warning(u
'unable to extract uploader nickname')
698 if 'title' not in video_info
:
699 raise ExtractorError(u
'Unable to extract video title')
700 video_title
= compat_urllib_parse
.unquote_plus(video_info
['title'][0])
703 # We try first to get a high quality image:
704 m_thumb
= re
.search(r
'<span itemprop="thumbnail".*?href="(.*?)">',
705 video_webpage
, re
.DOTALL
)
706 if m_thumb
is not None:
707 video_thumbnail
= m_thumb
.group(1)
708 elif 'thumbnail_url' not in video_info
:
709 self
._downloader
.report_warning(u
'unable to extract video thumbnail')
711 else: # don't panic if we can't find it
712 video_thumbnail
= compat_urllib_parse
.unquote_plus(video_info
['thumbnail_url'][0])
716 mobj
= re
.search(r
'id="eow-date.*?>(.*?)</span>', video_webpage
, re
.DOTALL
)
718 upload_date
= ' '.join(re
.sub(r
'[/,-]', r
' ', mobj
.group(1)).split())
719 upload_date
= unified_strdate(upload_date
)
722 video_description
= get_element_by_id("eow-description", video_webpage
)
723 if video_description
:
724 video_description
= clean_html(video_description
)
726 fd_mobj
= re
.search(r
'<meta name="description" content="([^"]+)"', video_webpage
)
728 video_description
= unescapeHTML(fd_mobj
.group(1))
730 video_description
= u
''
733 video_subtitles
= self
.extract_subtitles(video_id
, video_webpage
)
735 if self
._downloader
.params
.get('listsubtitles', False):
736 self
._list
_available
_subtitles
(video_id
, video_webpage
)
739 if 'length_seconds' not in video_info
:
740 self
._downloader
.report_warning(u
'unable to extract video duration')
743 video_duration
= compat_urllib_parse
.unquote_plus(video_info
['length_seconds'][0])
745 # Decide which formats to download
748 mobj
= re
.search(r
';ytplayer.config = ({.*?});', video_webpage
)
750 raise ValueError('Could not find vevo ID')
751 info
= json
.loads(mobj
.group(1))
753 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
754 # this signatures are encrypted
755 m_s
= re
.search(r
'[&,]s=', args
['url_encoded_fmt_stream_map'])
757 self
.to_screen(u
'%s: Encrypted signatures detected.' % video_id
)
758 video_info
['url_encoded_fmt_stream_map'] = [args
['url_encoded_fmt_stream_map']]
759 m_s
= re
.search(r
'[&,]s=', args
.get('adaptive_fmts', u
''))
761 if 'url_encoded_fmt_stream_map' in video_info
:
762 video_info
['url_encoded_fmt_stream_map'][0] += ',' + args
['adaptive_fmts']
764 video_info
['url_encoded_fmt_stream_map'] = [args
['adaptive_fmts']]
765 elif 'adaptive_fmts' in video_info
:
766 if 'url_encoded_fmt_stream_map' in video_info
:
767 video_info
['url_encoded_fmt_stream_map'][0] += ',' + video_info
['adaptive_fmts'][0]
769 video_info
['url_encoded_fmt_stream_map'] = video_info
['adaptive_fmts']
773 if 'conn' in video_info
and video_info
['conn'][0].startswith('rtmp'):
774 self
.report_rtmp_download()
775 video_url_list
= [(None, video_info
['conn'][0])]
776 elif 'url_encoded_fmt_stream_map' in video_info
and len(video_info
['url_encoded_fmt_stream_map']) >= 1:
777 if 'rtmpe%3Dyes' in video_info
['url_encoded_fmt_stream_map'][0]:
778 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected
=True)
780 for url_data_str
in video_info
['url_encoded_fmt_stream_map'][0].split(','):
781 url_data
= compat_parse_qs(url_data_str
)
782 if 'itag' in url_data
and 'url' in url_data
:
783 url
= url_data
['url'][0]
784 if 'sig' in url_data
:
785 url
+= '&signature=' + url_data
['sig'][0]
786 elif 's' in url_data
:
787 if self
._downloader
.params
.get('verbose'):
790 player
= 'flash player'
792 player
= u
'html5 player %s' % self
._search
_regex
(r
'html5player-(.+?)\.js', video_webpage
,
793 'html5 player', fatal
=False)
794 parts_sizes
= u
'.'.join(compat_str(len(part
)) for part
in s
.split('.'))
795 self
.to_screen(u
'encrypted signature length %d (%s), itag %s, %s' %
796 (len(s
), parts_sizes
, url_data
['itag'][0], player
))
797 encrypted_sig
= url_data
['s'][0]
799 signature
= self
._decrypt
_signature
_age
_gate
(encrypted_sig
)
801 signature
= self
._decrypt
_signature
(encrypted_sig
)
802 url
+= '&signature=' + signature
803 if 'ratebypass' not in url
:
804 url
+= '&ratebypass=yes'
805 url_map
[url_data
['itag'][0]] = url
806 video_url_list
= self
._get
_video
_url
_list
(url_map
)
807 if not video_url_list
:
809 elif video_info
.get('hlsvp'):
810 manifest_url
= video_info
['hlsvp'][0]
811 url_map
= self
._extract
_from
_m
3u8(manifest_url
, video_id
)
812 video_url_list
= self
._get
_video
_url
_list
(url_map
)
813 if not video_url_list
:
817 raise ExtractorError(u
'no conn or url_encoded_fmt_stream_map information found in video info')
820 for format_param
, video_real_url
in video_url_list
:
822 video_extension
= self
._video
_extensions
.get(format_param
, 'flv')
824 video_format
= '{0} - {1}{2}'.format(format_param
if format_param
else video_extension
,
825 self
._video
_dimensions
.get(format_param
, '???'),
826 ' ('+self
._special
_itags
[format_param
]+')' if format_param
in self
._special
_itags
else '')
830 'url': video_real_url
,
831 'uploader': video_uploader
,
832 'uploader_id': video_uploader_id
,
833 'upload_date': upload_date
,
834 'title': video_title
,
835 'ext': video_extension
,
836 'format': video_format
,
837 'thumbnail': video_thumbnail
,
838 'description': video_description
,
839 'player_url': player_url
,
840 'subtitles': video_subtitles
,
841 'duration': video_duration
845 class YoutubePlaylistIE(InfoExtractor
):
846 IE_DESC
= u
'YouTube.com playlists'
852 (?:course|view_play_list|my_playlists|artist|playlist|watch)
853 \? (?:.*?&)*? (?:p|a|list)=
856 ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
859 ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
861 _TEMPLATE_URL
= 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
863 IE_NAME
= u
'youtube:playlist'
def suitable(cls, url):
    """Return True when *url* matches this extractor's URL pattern."""
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    return match is not None
870 def _real_extract(self
, url
):
871 # Extract playlist id
872 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
874 raise ExtractorError(u
'Invalid URL: %s' % url
)
876 # Download playlist videos from API
877 playlist_id
= mobj
.group(1) or mobj
.group(2)
880 for page_num
in itertools
.count(1):
881 start_index
= self
._MAX
_RESULTS
* (page_num
- 1) + 1
882 if start_index
>= 1000:
883 self
._downloader
.report_warning(u
'Max number of results reached')
885 url
= self
._TEMPLATE
_URL
% (playlist_id
, self
._MAX
_RESULTS
, start_index
)
886 page
= self
._download
_webpage
(url
, playlist_id
, u
'Downloading page #%s' % page_num
)
889 response
= json
.loads(page
)
890 except ValueError as err
:
891 raise ExtractorError(u
'Invalid JSON in API response: ' + compat_str(err
))
893 if 'feed' not in response
:
894 raise ExtractorError(u
'Got a malformed response from YouTube API')
895 playlist_title
= response
['feed']['title']['$t']
896 if 'entry' not in response
['feed']:
897 # Number of videos is a multiple of self._MAX_RESULTS
900 for entry
in response
['feed']['entry']:
901 index
= entry
['yt$position']['$t']
902 if 'media$group' in entry
and 'yt$videoid' in entry
['media$group']:
905 'https://www.youtube.com/watch?v=' + entry
['media$group']['yt$videoid']['$t']
908 videos
= [v
[1] for v
in sorted(videos
)]
910 url_results
= [self
.url_result(vurl
, 'Youtube') for vurl
in videos
]
911 return [self
.playlist_result(url_results
, playlist_id
, playlist_title
)]
914 class YoutubeChannelIE(InfoExtractor
):
915 IE_DESC
= u
'YouTube.com channels'
916 _VALID_URL
= r
"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
917 _TEMPLATE_URL
= 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
918 _MORE_PAGES_INDICATOR
= 'yt-uix-load-more'
919 _MORE_PAGES_URL
= 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
920 IE_NAME
= u
'youtube:channel'
922 def extract_videos_from_page(self
, page
):
924 for mobj
in re
.finditer(r
'href="/watch\?v=([0-9A-Za-z_-]+)&?', page
):
925 if mobj
.group(1) not in ids_in_page
:
926 ids_in_page
.append(mobj
.group(1))
929 def _real_extract(self
, url
):
931 mobj
= re
.match(self
._VALID
_URL
, url
)
933 raise ExtractorError(u
'Invalid URL: %s' % url
)
935 # Download channel page
936 channel_id
= mobj
.group(1)
940 url
= self
._TEMPLATE
_URL
% (channel_id
, pagenum
)
941 page
= self
._download
_webpage
(url
, channel_id
,
942 u
'Downloading page #%s' % pagenum
)
944 # Extract video identifiers
945 ids_in_page
= self
.extract_videos_from_page(page
)
946 video_ids
.extend(ids_in_page
)
948 # Download any subsequent channel pages using the json-based channel_ajax query
949 if self
._MORE
_PAGES
_INDICATOR
in page
:
950 for pagenum
in itertools
.count(1):
951 url
= self
._MORE
_PAGES
_URL
% (pagenum
, channel_id
)
952 page
= self
._download
_webpage
(url
, channel_id
,
953 u
'Downloading page #%s' % pagenum
)
955 page
= json
.loads(page
)
957 ids_in_page
= self
.extract_videos_from_page(page
['content_html'])
958 video_ids
.extend(ids_in_page
)
960 if self
._MORE
_PAGES
_INDICATOR
not in page
['load_more_widget_html']:
963 self
._downloader
.to_screen(u
'[youtube] Channel %s: Found %i videos' % (channel_id
, len(video_ids
)))
965 urls
= ['http://www.youtube.com/watch?v=%s' % id for id in video_ids
]
966 url_entries
= [self
.url_result(eurl
, 'Youtube') for eurl
in urls
]
967 return [self
.playlist_result(url_entries
, channel_id
)]
970 class YoutubeUserIE(InfoExtractor
):
971 IE_DESC
= u
'YouTube.com user videos (URL or "ytuser" keyword)'
972 _VALID_URL
= r
'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
973 _TEMPLATE_URL
= 'http://gdata.youtube.com/feeds/api/users/%s'
974 _GDATA_PAGE_SIZE
= 50
975 _GDATA_URL
= 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
976 IE_NAME
= u
'youtube:user'
def suitable(cls, url):
    """Return True when *url* should be handled by this extractor.

    This extractor's regex is very permissive, so defer to any other
    YouTube extractor first: if one of them can handle the URL, this
    one must not claim it.
    """
    # Redundant iter() around the generator expression removed; a
    # generator expression is already an iterator.
    other_ies = (klass for (name, klass) in globals().items()
                 if name.endswith('IE') and klass is not cls)
    if any(ie.suitable(url) for ie in other_ies):
        return False
    return super(YoutubeUserIE, cls).suitable(url)
986 def _real_extract(self
, url
):
988 mobj
= re
.match(self
._VALID
_URL
, url
)
990 raise ExtractorError(u
'Invalid URL: %s' % url
)
992 username
= mobj
.group(1)
994 # Download video ids using YouTube Data API. Result size per
995 # query is limited (currently to 50 videos) so we need to query
996 # page by page until there are no video ids - it means we got
1001 for pagenum
in itertools
.count(0):
1002 start_index
= pagenum
* self
._GDATA
_PAGE
_SIZE
+ 1
1004 gdata_url
= self
._GDATA
_URL
% (username
, self
._GDATA
_PAGE
_SIZE
, start_index
)
1005 page
= self
._download
_webpage
(gdata_url
, username
,
1006 u
'Downloading video ids from %d to %d' % (start_index
, start_index
+ self
._GDATA
_PAGE
_SIZE
))
1009 response
= json
.loads(page
)
1010 except ValueError as err
:
1011 raise ExtractorError(u
'Invalid JSON in API response: ' + compat_str(err
))
1012 if 'entry' not in response
['feed']:
1013 # Number of videos is a multiple of self._MAX_RESULTS
1016 # Extract video identifiers
1018 for entry
in response
['feed']['entry']:
1019 ids_in_page
.append(entry
['id']['$t'].split('/')[-1])
1020 video_ids
.extend(ids_in_page
)
1022 # A little optimization - if current page is not
1023 # "full", ie. does not contain PAGE_SIZE video ids then
1024 # we can assume that this page is the last one - there
1025 # are no more ids on further pages - no need to query
1028 if len(ids_in_page
) < self
._GDATA
_PAGE
_SIZE
:
1031 urls
= ['http://www.youtube.com/watch?v=%s' % video_id
for video_id
in video_ids
]
1032 url_results
= [self
.url_result(rurl
, 'Youtube') for rurl
in urls
]
1033 return [self
.playlist_result(url_results
, playlist_title
= username
)]
1035 class YoutubeSearchIE(SearchInfoExtractor
):
1036 IE_DESC
= u
'YouTube.com searches'
1037 _API_URL
= 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1039 IE_NAME
= u
'youtube:search'
1040 _SEARCH_KEY
= 'ytsearch'
def report_download_page(self, query, pagenum):
    """Announce that the given search-result page is being downloaded."""
    message = u'[youtube] query "%s": Downloading page %s' % (query, pagenum)
    self._downloader.to_screen(message)
1046 def _get_n_results(self
, query
, n
):
1047 """Get a specified number of results for a query"""
1053 while (50 * pagenum
) < limit
:
1054 self
.report_download_page(query
, pagenum
+1)
1055 result_url
= self
._API
_URL
% (compat_urllib_parse
.quote_plus(query
), (50*pagenum
)+1)
1056 request
= compat_urllib_request
.Request(result_url
)
1058 data
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
1059 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
1060 raise ExtractorError(u
'Unable to download API page: %s' % compat_str(err
))
1061 api_response
= json
.loads(data
)['data']
1063 if not 'items' in api_response
:
1064 raise ExtractorError(u
'[youtube] No video results')
1066 new_ids
= list(video
['id'] for video
in api_response
['items'])
1067 video_ids
+= new_ids
1069 limit
= min(n
, api_response
['totalItems'])
1072 if len(video_ids
) > n
:
1073 video_ids
= video_ids
[:n
]
1074 videos
= [self
.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids
]
1075 return self
.playlist_result(videos
, query
)
class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Resolve a show page into one playlist result per season."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of a show is published as a separate playlist
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        results = []
        for season in m_seasons:
            season_url = 'https://www.youtube.com' + season.group(1)
            results.append(self.url_result(season_url, 'YoutubePlaylist'))
        return results
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # Page offset between consecutive feed_ajax requests
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template for the feed; one %s placeholder remains for paging."""
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Page through the feed and return all entries as a playlist."""
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i * self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            # orderedSet deduplicates while preserving first-seen order
            ids = orderedSet(m.group(1) for m in m_ids)
            # video_id instead of id: avoid shadowing the builtin
            feed_entries.extend(self.url_result(video_id, 'Youtube') for video_id in ids)
            # A null 'paging' value marks the last page of the feed
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's subscriptions."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's recommended videos."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's "Watch Later" list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'

    # Watch Later is per-user, so the personal-feed ajax action is required
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Scrape the favourites playlist id and delegate to YoutubePlaylist."""
        favourites_url = 'https://www.youtube.com/my_favorites'
        webpage = self._download_webpage(favourites_url, 'Youtube Favourites videos')
        # The page embeds a link carrying the backing playlist's id
        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')