9 from .common
import InfoExtractor
, SearchInfoExtractor
15 compat_urllib_request
,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Google account login endpoint used by _login()
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    # Forces the site language to English so scraped pages are predictable
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    # Age-verification confirmation endpoint used by _confirm_age()
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name looked up in the user's .netrc for credentials
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False
35 def report_lang(self
):
36 """Report attempt to set language."""
37 self
.to_screen(u
'Setting language')
39 def _set_language(self
):
40 request
= compat_urllib_request
.Request(self
._LANG
_URL
)
43 compat_urllib_request
.urlopen(request
).read()
44 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
45 self
._downloader
.report_warning(u
'unable to set language: %s' % compat_str(err
))
50 (username
, password
) = self
._get
_login
_info
()
51 # No authentication to be performed
53 if self
._LOGIN
_REQUIRED
:
54 raise ExtractorError(u
'No login info available, needed for using %s.' % self
.IE_NAME
, expected
=True)
57 request
= compat_urllib_request
.Request(self
._LOGIN
_URL
)
59 login_page
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
60 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
61 self
._downloader
.report_warning(u
'unable to fetch login page: %s' % compat_str(err
))
66 match
= re
.search(re
.compile(r
'<input.+?name="GALX".+?value="(.+?)"', re
.DOTALL
), login_page
)
69 match
= re
.search(re
.compile(r
'<input.+?name="dsh".+?value="(.+?)"', re
.DOTALL
), login_page
)
75 u
'continue': u
'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
79 u
'PersistentCookie': u
'yes',
81 u
'bgresponse': u
'js_disabled',
82 u
'checkConnection': u
'',
83 u
'checkedDomains': u
'youtube',
89 u
'signIn': u
'Sign in',
91 u
'service': u
'youtube',
95 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
97 login_form
= dict((k
.encode('utf-8'), v
.encode('utf-8')) for k
,v
in login_form_strs
.items())
98 login_data
= compat_urllib_parse
.urlencode(login_form
).encode('ascii')
99 request
= compat_urllib_request
.Request(self
._LOGIN
_URL
, login_data
)
102 login_results
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
103 if re
.search(r
'(?i)<form[^>]* id="gaia_loginform"', login_results
) is not None:
104 self
._downloader
.report_warning(u
'unable to log in: bad username or password')
106 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
107 self
._downloader
.report_warning(u
'unable to log in: %s' % compat_str(err
))
111 def _confirm_age(self
):
114 'action_confirm': 'Confirm',
116 request
= compat_urllib_request
.Request(self
._AGE
_URL
, compat_urllib_parse
.urlencode(age_form
))
118 self
.report_age_confirmation()
119 compat_urllib_request
.urlopen(request
).read().decode('utf-8')
120 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
121 raise ExtractorError(u
'Unable to confirm age: %s' % compat_str(err
))
124 def _real_initialize(self
):
125 if self
._downloader
is None:
127 if not self
._set
_language
():
129 if not self
._login
():
133 class YoutubeIE(YoutubeBaseInfoExtractor
):
134 IE_DESC
= u
'YouTube.com'
137 (?:https?://)? # http(s):// (optional)
138 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
139 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
140 (?:.*?\#/)? # handle anchor (#/) redirect urls
141 (?: # the various things that can precede the ID:
142 (?:(?:v|embed|e)/) # v/ or embed/ or e/
143 |(?: # or the v= param in all its forms
144 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
145 (?:\?|\#!?) # the params delimiter ? or # or #!
146 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
150 |youtu\.be/ # just youtu.be/xxxx
152 )? # all until now is optional -> you can pass the naked ID
153 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
154 (?(1).+)? # if we found the ID, everything can follow
156 _NEXT_URL_RE
= r
'[\?&]next_url=([^&]+)'
157 # Listed in order of quality
158 _available_formats
= ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
159 # Apple HTTP Live Streaming
160 '96', '95', '94', '93', '92', '132', '151',
162 '85', '84', '102', '83', '101', '82', '100',
164 '138', '137', '248', '136', '247', '135', '246',
165 '245', '244', '134', '243', '133', '242', '160',
167 '141', '172', '140', '171', '139',
169 _available_formats_prefer_free
= ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
170 # Apple HTTP Live Streaming
171 '96', '95', '94', '93', '92', '132', '151',
173 '85', '102', '84', '101', '83', '100', '82',
175 '138', '248', '137', '247', '136', '246', '245',
176 '244', '135', '243', '134', '242', '133', '160',
178 '172', '141', '171', '140', '139',
180 _video_formats_map
= {
181 'flv': ['35', '34', '6', '5'],
182 '3gp': ['36', '17', '13'],
183 'mp4': ['38', '37', '22', '18'],
184 'webm': ['46', '45', '44', '43'],
186 _video_extensions
= {
208 # Apple HTTP Live Streaming
240 _video_dimensions
= {
322 u
"url": u
"http://www.youtube.com/watch?v=BaW_jenozKc",
323 u
"file": u
"BaW_jenozKc.mp4",
325 u
"title": u
"youtube-dl test video \"'/\\ä↭𝕐",
326 u
"uploader": u
"Philipp Hagemeister",
327 u
"uploader_id": u
"phihag",
328 u
"upload_date": u
"20121002",
329 u
"description": u
"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
333 u
"url": u
"http://www.youtube.com/watch?v=1ltcDfZMA3U",
334 u
"file": u
"1ltcDfZMA3U.flv",
335 u
"note": u
"Test VEVO video (#897)",
337 u
"upload_date": u
"20070518",
338 u
"title": u
"Maps - It Will Find You",
339 u
"description": u
"Music video by Maps performing It Will Find You.",
340 u
"uploader": u
"MuteUSA",
341 u
"uploader_id": u
"MuteUSA"
345 u
"url": u
"http://www.youtube.com/watch?v=UxxajLWwzqY",
346 u
"file": u
"UxxajLWwzqY.mp4",
347 u
"note": u
"Test generic use_cipher_signature video (#897)",
349 u
"upload_date": u
"20120506",
350 u
"title": u
"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
351 u
"description": u
"md5:3e2666e0a55044490499ea45fe9037b7",
352 u
"uploader": u
"Icona Pop",
353 u
"uploader_id": u
"IconaPop"
357 u
"url": u
"https://www.youtube.com/watch?v=07FYdnEawAQ",
358 u
"file": u
"07FYdnEawAQ.mp4",
359 u
"note": u
"Test VEVO video with age protection (#956)",
361 u
"upload_date": u
"20130703",
362 u
"title": u
"Justin Timberlake - Tunnel Vision (Explicit)",
363 u
"description": u
"md5:64249768eec3bc4276236606ea996373",
364 u
"uploader": u
"justintimberlakeVEVO",
365 u
"uploader_id": u
"justintimberlakeVEVO"
369 u
'url': u
'https://www.youtube.com/watch?v=TGi3HqYrWHE',
370 u
'file': u
'TGi3HqYrWHE.mp4',
371 u
'note': u
'm3u8 video',
373 u
'title': u
'Triathlon - Men - London 2012 Olympic Games',
374 u
'description': u
'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
375 u
'uploader': u
'olympic',
376 u
'upload_date': u
'20120807',
377 u
'uploader_id': u
'olympic',
380 u
'skip_download': True,
387 def suitable(cls
, url
):
388 """Receives a URL and returns True if suitable for this IE."""
389 if YoutubePlaylistIE
.suitable(url
) or YoutubeSubscriptionsIE
.suitable(url
): return False
390 return re
.match(cls
._VALID
_URL
, url
, re
.VERBOSE
) is not None
392 def report_video_webpage_download(self
, video_id
):
393 """Report attempt to download video webpage."""
394 self
.to_screen(u
'%s: Downloading video webpage' % video_id
)
396 def report_video_info_webpage_download(self
, video_id
):
397 """Report attempt to download video info webpage."""
398 self
.to_screen(u
'%s: Downloading video info webpage' % video_id
)
400 def report_video_subtitles_download(self
, video_id
):
401 """Report attempt to download video info webpage."""
402 self
.to_screen(u
'%s: Checking available subtitles' % video_id
)
404 def report_video_subtitles_request(self
, video_id
, sub_lang
, format
):
405 """Report attempt to download video info webpage."""
406 self
.to_screen(u
'%s: Downloading video subtitles for %s.%s' % (video_id
, sub_lang
, format
))
408 def report_video_subtitles_available(self
, video_id
, sub_lang_list
):
409 """Report available subtitles."""
410 sub_lang
= ",".join(list(sub_lang_list
.keys()))
411 self
.to_screen(u
'%s: Available subtitles for video: %s' % (video_id
, sub_lang
))
413 def report_information_extraction(self
, video_id
):
414 """Report attempt to extract video information."""
415 self
.to_screen(u
'%s: Extracting video information' % video_id
)
417 def report_unavailable_format(self
, video_id
, format
):
418 """Report extracted video URL."""
419 self
.to_screen(u
'%s: Format %s not available' % (video_id
, format
))
421 def report_rtmp_download(self
):
422 """Indicate the download will use the RTMP protocol."""
423 self
.to_screen(u
'RTMP download detected')
425 def _decrypt_signature(self
, s
):
426 """Turn the encrypted s field into a working signature"""
429 return s
[25] + s
[3:25] + s
[0] + s
[26:42] + s
[79] + s
[43:79] + s
[91] + s
[80:83]
431 return s
[25] + s
[3:25] + s
[2] + s
[26:40] + s
[77] + s
[41:77] + s
[89] + s
[78:81]
433 return s
[84:78:-1] + s
[87] + s
[77:60:-1] + s
[0] + s
[59:3:-1]
435 return s
[7:28] + s
[87] + s
[29:45] + s
[55] + s
[46:55] + s
[2] + s
[56:87] + s
[28]
437 return s
[6:27] + s
[4] + s
[28:39] + s
[27] + s
[40:59] + s
[2] + s
[60:]
439 return s
[5:34] + s
[0] + s
[35:38] + s
[3] + s
[39:45] + s
[38] + s
[46:53] + s
[73] + s
[54:73] + s
[85] + s
[74:85] + s
[53]
441 return s
[83:34:-1] + s
[0] + s
[33:27:-1] + s
[3] + s
[26:19:-1] + s
[34] + s
[18:3:-1] + s
[27]
443 return s
[81:36:-1] + s
[0] + s
[35:2:-1]
445 return s
[81:64:-1] + s
[82] + s
[63:52:-1] + s
[45] + s
[51:45:-1] + s
[1] + s
[44:1:-1] + s
[0]
447 return s
[1:19] + s
[0] + s
[20:68] + s
[19] + s
[69:82]
449 return s
[56] + s
[79:56:-1] + s
[41] + s
[55:41:-1] + s
[80] + s
[40:34:-1] + s
[0] + s
[33:29:-1] + s
[34] + s
[28:9:-1] + s
[29] + s
[8:0:-1] + s
[9]
451 return s
[1:19] + s
[0] + s
[20:68] + s
[19] + s
[69:80]
453 return s
[54] + s
[77:54:-1] + s
[39] + s
[53:39:-1] + s
[78] + s
[38:34:-1] + s
[0] + s
[33:29:-1] + s
[34] + s
[28:9:-1] + s
[29] + s
[8:0:-1] + s
[9]
456 raise ExtractorError(u
'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s
)))
458 def _decrypt_signature_age_gate(self
, s
):
459 # The videos with age protection use another player, so the algorithms
462 return s
[2:63] + s
[82] + s
[64:82] + s
[63]
464 # Fallback to the other algortihms
465 return self
._decrypt
_signature
(s
)
468 def _get_available_subtitles(self
, video_id
):
469 self
.report_video_subtitles_download(video_id
)
470 request
= compat_urllib_request
.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
)
472 sub_list
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
473 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
474 self
._downloader
.report_warning(u
'unable to download video subtitles: %s' % compat_str(err
))
476 sub_lang_list
= re
.findall(r
'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list
)
477 sub_lang_list
= dict((l
[1], l
[0]) for l
in sub_lang_list
)
478 if not sub_lang_list
:
479 self
._downloader
.report_warning(u
'video doesn\'t have subtitles')
483 def _list_available_subtitles(self
, video_id
):
484 sub_lang_list
= self
._get
_available
_subtitles
(video_id
)
485 self
.report_video_subtitles_available(video_id
, sub_lang_list
)
487 def _request_subtitle(self
, sub_lang
, sub_name
, video_id
, format
):
489 Return the subtitle as a string or None if they are not found
491 self
.report_video_subtitles_request(video_id
, sub_lang
, format
)
492 params
= compat_urllib_parse
.urlencode({
498 url
= 'http://www.youtube.com/api/timedtext?' + params
500 sub
= compat_urllib_request
.urlopen(url
).read().decode('utf-8')
501 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
502 self
._downloader
.report_warning(u
'unable to download video subtitles for %s: %s' % (sub_lang
, compat_str(err
)))
505 self
._downloader
.report_warning(u
'Did not fetch video subtitles')
509 def _request_automatic_caption(self
, video_id
, webpage
):
510 """We need the webpage for getting the captions url, pass it as an
511 argument to speed up the process."""
512 sub_lang
= (self
._downloader
.params
.get('subtitleslangs') or ['en'])[0]
513 sub_format
= self
._downloader
.params
.get('subtitlesformat')
514 self
.to_screen(u
'%s: Looking for automatic captions' % video_id
)
515 mobj
= re
.search(r
';ytplayer.config = ({.*?});', webpage
)
516 err_msg
= u
'Couldn\'t find automatic captions for "%s"' % sub_lang
518 self
._downloader
.report_warning(err_msg
)
520 player_config
= json
.loads(mobj
.group(1))
522 args
= player_config
[u
'args']
523 caption_url
= args
[u
'ttsurl']
524 timestamp
= args
[u
'timestamp']
525 params
= compat_urllib_parse
.urlencode({
532 subtitles_url
= caption_url
+ '&' + params
533 sub
= self
._download
_webpage
(subtitles_url
, video_id
, u
'Downloading automatic captions')
534 return {sub_lang: sub}
535 # An extractor error can be raise by the download process if there are
536 # no automatic captions but there are subtitles
537 except (KeyError, ExtractorError
):
538 self
._downloader
.report_warning(err_msg
)
541 def _extract_subtitles(self
, video_id
):
543 Return a dictionary: {language: subtitles} or {} if the subtitles
546 available_subs_list
= self
._get
_available
_subtitles
(video_id
)
547 sub_format
= self
._downloader
.params
.get('subtitlesformat')
548 if not available_subs_list
: #There was some error, it didn't get the available subtitles
550 if self
._downloader
.params
.get('allsubtitles', False):
551 sub_lang_list
= available_subs_list
553 if self
._downloader
.params
.get('subtitleslangs', False):
554 reqested_langs
= self
._downloader
.params
.get('subtitleslangs')
555 elif 'en' in available_subs_list
:
556 reqested_langs
= ['en']
558 reqested_langs
= [list(available_subs_list
.keys())[0]]
561 for sub_lang
in reqested_langs
:
562 if not sub_lang
in available_subs_list
:
563 self
._downloader
.report_warning(u
'no closed captions found in the specified language "%s"' % sub_lang
)
565 sub_lang_list
[sub_lang
] = available_subs_list
[sub_lang
]
567 for sub_lang
in sub_lang_list
:
568 subtitle
= self
._request
_subtitle
(sub_lang
, sub_lang_list
[sub_lang
].encode('utf-8'), video_id
, sub_format
)
570 subtitles
[sub_lang
] = subtitle
573 def _print_formats(self
, formats
):
574 print('Available formats:')
576 print('%s\t:\t%s\t[%s]%s' %(x
, self
._video
_extensions
.get(x
, 'flv'),
577 self
._video
_dimensions
.get(x
, '???'),
578 ' ('+self
._special
_itags
[x
]+')' if x
in self
._special
_itags
else ''))
580 def _extract_id(self
, url
):
581 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
583 raise ExtractorError(u
'Invalid URL: %s' % url
)
584 video_id
= mobj
.group(2)
587 def _get_video_url_list(self
, url_map
):
589 Transform a dictionary in the format {itag:url} to a list of (itag, url)
590 with the requested formats.
592 req_format
= self
._downloader
.params
.get('format', None)
593 format_limit
= self
._downloader
.params
.get('format_limit', None)
594 available_formats
= self
._available
_formats
_prefer
_free
if self
._downloader
.params
.get('prefer_free_formats', False) else self
._available
_formats
595 if format_limit
is not None and format_limit
in available_formats
:
596 format_list
= available_formats
[available_formats
.index(format_limit
):]
598 format_list
= available_formats
599 existing_formats
= [x
for x
in format_list
if x
in url_map
]
600 if len(existing_formats
) == 0:
601 raise ExtractorError(u
'no known formats available for video')
602 if self
._downloader
.params
.get('listformats', None):
603 self
._print
_formats
(existing_formats
)
605 if req_format
is None or req_format
== 'best':
606 video_url_list
= [(existing_formats
[0], url_map
[existing_formats
[0]])] # Best quality
607 elif req_format
== 'worst':
608 video_url_list
= [(existing_formats
[-1], url_map
[existing_formats
[-1]])] # worst quality
609 elif req_format
in ('-1', 'all'):
610 video_url_list
= [(f
, url_map
[f
]) for f
in existing_formats
] # All formats
612 # Specific formats. We pick the first in a slash-delimeted sequence.
613 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
614 # available in the specified format. For example,
615 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
616 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
617 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
618 req_formats
= req_format
.split('/')
619 video_url_list
= None
620 for rf
in req_formats
:
622 video_url_list
= [(rf
, url_map
[rf
])]
624 if rf
in self
._video
_formats
_map
:
625 for srf
in self
._video
_formats
_map
[rf
]:
627 video_url_list
= [(srf
, url_map
[srf
])]
632 if video_url_list
is None:
633 raise ExtractorError(u
'requested format not available')
634 return video_url_list
636 def _extract_from_m3u8(self
, manifest_url
, video_id
):
638 def _get_urls(_manifest
):
639 lines
= _manifest
.split('\n')
640 urls
= filter(lambda l
: l
and not l
.startswith('#'),
643 manifest
= self
._download
_webpage
(manifest_url
, video_id
, u
'Downloading formats manifest')
644 formats_urls
= _get_urls(manifest
)
645 for format_url
in formats_urls
:
646 itag
= self
._search
_regex
(r
'itag/(\d+?)/', format_url
, 'itag')
647 url_map
[itag
] = format_url
650 def _real_extract(self
, url
):
651 if re
.match(r
'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url
):
652 self
._downloader
.report_warning(u
'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
654 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
655 mobj
= re
.search(self
._NEXT
_URL
_RE
, url
)
657 url
= 'https://www.youtube.com/' + compat_urllib_parse
.unquote(mobj
.group(1)).lstrip('/')
658 video_id
= self
._extract
_id
(url
)
661 self
.report_video_webpage_download(video_id
)
662 url
= 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
663 request
= compat_urllib_request
.Request(url
)
665 video_webpage_bytes
= compat_urllib_request
.urlopen(request
).read()
666 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
667 raise ExtractorError(u
'Unable to download video webpage: %s' % compat_str(err
))
669 video_webpage
= video_webpage_bytes
.decode('utf-8', 'ignore')
671 # Attempt to extract SWF player URL
672 mobj
= re
.search(r
'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage
)
674 player_url
= re
.sub(r
'\\(.)', r
'\1', mobj
.group(1))
679 self
.report_video_info_webpage_download(video_id
)
680 if re
.search(r
'player-age-gate-content">', video_webpage
) is not None:
681 self
.report_age_confirmation()
683 # We simulate the access to the video from www.youtube.com/v/{video_id}
684 # this can be viewed without login into Youtube
685 data
= compat_urllib_parse
.urlencode({'video_id': video_id
,
689 'eurl': 'https://youtube.googleapis.com/v/' + video_id
,
693 video_info_url
= 'https://www.youtube.com/get_video_info?' + data
694 video_info_webpage
= self
._download
_webpage
(video_info_url
, video_id
,
696 errnote
='unable to download video info webpage')
697 video_info
= compat_parse_qs(video_info_webpage
)
700 for el_type
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
701 video_info_url
= ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
702 % (video_id
, el_type
))
703 video_info_webpage
= self
._download
_webpage
(video_info_url
, video_id
,
705 errnote
='unable to download video info webpage')
706 video_info
= compat_parse_qs(video_info_webpage
)
707 if 'token' in video_info
:
709 if 'token' not in video_info
:
710 if 'reason' in video_info
:
711 raise ExtractorError(u
'YouTube said: %s' % video_info
['reason'][0], expected
=True)
713 raise ExtractorError(u
'"token" parameter not in video info for unknown reason')
715 # Check for "rental" videos
716 if 'ypc_video_rental_bar_text' in video_info
and 'author' not in video_info
:
717 raise ExtractorError(u
'"rental" videos not supported')
719 # Start extracting information
720 self
.report_information_extraction(video_id
)
723 if 'author' not in video_info
:
724 raise ExtractorError(u
'Unable to extract uploader name')
725 video_uploader
= compat_urllib_parse
.unquote_plus(video_info
['author'][0])
728 video_uploader_id
= None
729 mobj
= re
.search(r
'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage
)
731 video_uploader_id
= mobj
.group(1)
733 self
._downloader
.report_warning(u
'unable to extract uploader nickname')
736 if 'title' not in video_info
:
737 raise ExtractorError(u
'Unable to extract video title')
738 video_title
= compat_urllib_parse
.unquote_plus(video_info
['title'][0])
741 # We try first to get a high quality image:
742 m_thumb
= re
.search(r
'<span itemprop="thumbnail".*?href="(.*?)">',
743 video_webpage
, re
.DOTALL
)
744 if m_thumb
is not None:
745 video_thumbnail
= m_thumb
.group(1)
746 elif 'thumbnail_url' not in video_info
:
747 self
._downloader
.report_warning(u
'unable to extract video thumbnail')
749 else: # don't panic if we can't find it
750 video_thumbnail
= compat_urllib_parse
.unquote_plus(video_info
['thumbnail_url'][0])
754 mobj
= re
.search(r
'id="eow-date.*?>(.*?)</span>', video_webpage
, re
.DOTALL
)
756 upload_date
= ' '.join(re
.sub(r
'[/,-]', r
' ', mobj
.group(1)).split())
757 upload_date
= unified_strdate(upload_date
)
760 video_description
= get_element_by_id("eow-description", video_webpage
)
761 if video_description
:
762 video_description
= clean_html(video_description
)
764 fd_mobj
= re
.search(r
'<meta name="description" content="([^"]+)"', video_webpage
)
766 video_description
= unescapeHTML(fd_mobj
.group(1))
768 video_description
= u
''
771 video_subtitles
= None
773 if self
._downloader
.params
.get('writesubtitles', False) or self
._downloader
.params
.get('allsubtitles', False):
774 video_subtitles
= self
._extract
_subtitles
(video_id
)
775 elif self
._downloader
.params
.get('writeautomaticsub', False):
776 video_subtitles
= self
._request
_automatic
_caption
(video_id
, video_webpage
)
778 if self
._downloader
.params
.get('listsubtitles', False):
779 self
._list
_available
_subtitles
(video_id
)
782 if 'length_seconds' not in video_info
:
783 self
._downloader
.report_warning(u
'unable to extract video duration')
786 video_duration
= compat_urllib_parse
.unquote_plus(video_info
['length_seconds'][0])
788 # Decide which formats to download
791 mobj
= re
.search(r
';ytplayer.config = ({.*?});', video_webpage
)
793 raise ValueError('Could not find vevo ID')
794 info
= json
.loads(mobj
.group(1))
796 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
797 # this signatures are encrypted
798 m_s
= re
.search(r
'[&,]s=', args
['url_encoded_fmt_stream_map'])
800 self
.to_screen(u
'%s: Encrypted signatures detected.' % video_id
)
801 video_info
['url_encoded_fmt_stream_map'] = [args
['url_encoded_fmt_stream_map']]
802 m_s
= re
.search(r
'[&,]s=', args
.get('adaptive_fmts', u
''))
804 if 'url_encoded_fmt_stream_map' in video_info
:
805 video_info
['url_encoded_fmt_stream_map'][0] += ',' + args
['adaptive_fmts']
807 video_info
['url_encoded_fmt_stream_map'] = [args
['adaptive_fmts']]
808 elif 'adaptive_fmts' in video_info
:
809 if 'url_encoded_fmt_stream_map' in video_info
:
810 video_info
['url_encoded_fmt_stream_map'][0] += ',' + video_info
['adaptive_fmts'][0]
812 video_info
['url_encoded_fmt_stream_map'] = video_info
['adaptive_fmts']
816 if 'conn' in video_info
and video_info
['conn'][0].startswith('rtmp'):
817 self
.report_rtmp_download()
818 video_url_list
= [(None, video_info
['conn'][0])]
819 elif 'url_encoded_fmt_stream_map' in video_info
and len(video_info
['url_encoded_fmt_stream_map']) >= 1:
820 if 'rtmpe%3Dyes' in video_info
['url_encoded_fmt_stream_map'][0]:
821 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected
=True)
823 for url_data_str
in video_info
['url_encoded_fmt_stream_map'][0].split(','):
824 url_data
= compat_parse_qs(url_data_str
)
825 if 'itag' in url_data
and 'url' in url_data
:
826 url
= url_data
['url'][0]
827 if 'sig' in url_data
:
828 url
+= '&signature=' + url_data
['sig'][0]
829 elif 's' in url_data
:
830 if self
._downloader
.params
.get('verbose'):
833 player_version
= self
._search
_regex
(r
'ad3-(.+?)\.swf',
834 video_info
['ad3_module'][0] if 'ad3_module' in video_info
else 'NOT FOUND',
835 'flash player', fatal
=False)
836 player
= 'flash player %s' % player_version
838 player
= u
'html5 player %s' % self
._search
_regex
(r
'html5player-(.+?)\.js', video_webpage
,
839 'html5 player', fatal
=False)
840 parts_sizes
= u
'.'.join(compat_str(len(part
)) for part
in s
.split('.'))
841 self
.to_screen(u
'encrypted signature length %d (%s), itag %s, %s' %
842 (len(s
), parts_sizes
, url_data
['itag'][0], player
))
843 encrypted_sig
= url_data
['s'][0]
845 signature
= self
._decrypt
_signature
_age
_gate
(encrypted_sig
)
847 signature
= self
._decrypt
_signature
(encrypted_sig
)
848 url
+= '&signature=' + signature
849 if 'ratebypass' not in url
:
850 url
+= '&ratebypass=yes'
851 url_map
[url_data
['itag'][0]] = url
852 video_url_list
= self
._get
_video
_url
_list
(url_map
)
853 if not video_url_list
:
855 elif video_info
.get('hlsvp'):
856 manifest_url
= video_info
['hlsvp'][0]
857 url_map
= self
._extract
_from
_m
3u8(manifest_url
, video_id
)
858 video_url_list
= self
._get
_video
_url
_list
(url_map
)
859 if not video_url_list
:
863 raise ExtractorError(u
'no conn or url_encoded_fmt_stream_map information found in video info')
866 for format_param
, video_real_url
in video_url_list
:
868 video_extension
= self
._video
_extensions
.get(format_param
, 'flv')
870 video_format
= '{0} - {1}{2}'.format(format_param
if format_param
else video_extension
,
871 self
._video
_dimensions
.get(format_param
, '???'),
872 ' ('+self
._special
_itags
[format_param
]+')' if format_param
in self
._special
_itags
else '')
876 'url': video_real_url
,
877 'uploader': video_uploader
,
878 'uploader_id': video_uploader_id
,
879 'upload_date': upload_date
,
880 'title': video_title
,
881 'ext': video_extension
,
882 'format': video_format
,
883 'thumbnail': video_thumbnail
,
884 'description': video_description
,
885 'player_url': player_url
,
886 'subtitles': video_subtitles
,
887 'duration': video_duration
891 class YoutubePlaylistIE(InfoExtractor
):
892 IE_DESC
= u
'YouTube.com playlists'
898 (?:course|view_play_list|my_playlists|artist|playlist|watch)
899 \? (?:.*?&)*? (?:p|a|list)=
902 ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
905 ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
907 _TEMPLATE_URL
= 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
909 IE_NAME
= u
'youtube:playlist'
912 def suitable(cls
, url
):
913 """Receives a URL and returns True if suitable for this IE."""
914 return re
.match(cls
._VALID
_URL
, url
, re
.VERBOSE
) is not None
916 def _real_extract(self
, url
):
917 # Extract playlist id
918 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
920 raise ExtractorError(u
'Invalid URL: %s' % url
)
922 # Download playlist videos from API
923 playlist_id
= mobj
.group(1) or mobj
.group(2)
926 for page_num
in itertools
.count(1):
927 start_index
= self
._MAX
_RESULTS
* (page_num
- 1) + 1
928 if start_index
>= 1000:
929 self
._downloader
.report_warning(u
'Max number of results reached')
931 url
= self
._TEMPLATE
_URL
% (playlist_id
, self
._MAX
_RESULTS
, start_index
)
932 page
= self
._download
_webpage
(url
, playlist_id
, u
'Downloading page #%s' % page_num
)
935 response
= json
.loads(page
)
936 except ValueError as err
:
937 raise ExtractorError(u
'Invalid JSON in API response: ' + compat_str(err
))
939 if 'feed' not in response
:
940 raise ExtractorError(u
'Got a malformed response from YouTube API')
941 playlist_title
= response
['feed']['title']['$t']
942 if 'entry' not in response
['feed']:
943 # Number of videos is a multiple of self._MAX_RESULTS
946 for entry
in response
['feed']['entry']:
947 index
= entry
['yt$position']['$t']
948 if 'media$group' in entry
and 'yt$videoid' in entry
['media$group']:
951 'https://www.youtube.com/watch?v=' + entry
['media$group']['yt$videoid']['$t']
954 videos
= [v
[1] for v
in sorted(videos
)]
956 url_results
= [self
.url_result(vurl
, 'Youtube') for vurl
in videos
]
957 return [self
.playlist_result(url_results
, playlist_id
, playlist_title
)]
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # First page is fetched as HTML; subsequent pages via the ajax endpoint below.
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker string whose presence indicates more pages are available.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'
968 def extract_videos_from_page(self
, page
):
970 for mobj
in re
.finditer(r
'href="/watch\?v=([0-9A-Za-z_-]+)&?', page
):
971 if mobj
.group(1) not in ids_in_page
:
972 ids_in_page
.append(mobj
.group(1))
975 def _real_extract(self
, url
):
977 mobj
= re
.match(self
._VALID
_URL
, url
)
979 raise ExtractorError(u
'Invalid URL: %s' % url
)
981 # Download channel page
982 channel_id
= mobj
.group(1)
986 url
= self
._TEMPLATE
_URL
% (channel_id
, pagenum
)
987 page
= self
._download
_webpage
(url
, channel_id
,
988 u
'Downloading page #%s' % pagenum
)
990 # Extract video identifiers
991 ids_in_page
= self
.extract_videos_from_page(page
)
992 video_ids
.extend(ids_in_page
)
994 # Download any subsequent channel pages using the json-based channel_ajax query
995 if self
._MORE
_PAGES
_INDICATOR
in page
:
996 for pagenum
in itertools
.count(1):
997 url
= self
._MORE
_PAGES
_URL
% (pagenum
, channel_id
)
998 page
= self
._download
_webpage
(url
, channel_id
,
999 u
'Downloading page #%s' % pagenum
)
1001 page
= json
.loads(page
)
1003 ids_in_page
= self
.extract_videos_from_page(page
['content_html'])
1004 video_ids
.extend(ids_in_page
)
1006 if self
._MORE
_PAGES
_INDICATOR
not in page
['load_more_widget_html']:
1009 self
._downloader
.to_screen(u
'[youtube] Channel %s: Found %i videos' % (channel_id
, len(video_ids
)))
1011 urls
= ['http://www.youtube.com/watch?v=%s' % id for id in video_ids
]
1012 url_entries
= [self
.url_result(eurl
, 'Youtube') for eurl
in urls
]
1013 return [self
.playlist_result(url_entries
, channel_id
)]
class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps each uploads query at 50 results, so we page through.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'
1025 def suitable(cls
, url
):
1026 if YoutubeIE
.suitable(url
): return False
1027 else: return super(YoutubeUserIE
, cls
).suitable(url
)
1029 def _real_extract(self
, url
):
1031 mobj
= re
.match(self
._VALID
_URL
, url
)
1033 raise ExtractorError(u
'Invalid URL: %s' % url
)
1035 username
= mobj
.group(1)
1037 # Download video ids using YouTube Data API. Result size per
1038 # query is limited (currently to 50 videos) so we need to query
1039 # page by page until there are no video ids - it means we got
1044 for pagenum
in itertools
.count(0):
1045 start_index
= pagenum
* self
._GDATA
_PAGE
_SIZE
+ 1
1047 gdata_url
= self
._GDATA
_URL
% (username
, self
._GDATA
_PAGE
_SIZE
, start_index
)
1048 page
= self
._download
_webpage
(gdata_url
, username
,
1049 u
'Downloading video ids from %d to %d' % (start_index
, start_index
+ self
._GDATA
_PAGE
_SIZE
))
1051 # Extract video identifiers
1054 for mobj
in re
.finditer(self
._VIDEO
_INDICATOR
, page
):
1055 if mobj
.group(1) not in ids_in_page
:
1056 ids_in_page
.append(mobj
.group(1))
1058 video_ids
.extend(ids_in_page
)
1060 # A little optimization - if current page is not
1061 # "full", ie. does not contain PAGE_SIZE video ids then
1062 # we can assume that this page is the last one - there
1063 # are no more ids on further pages - no need to query
1066 if len(ids_in_page
) < self
._GDATA
_PAGE
_SIZE
:
1069 urls
= ['http://www.youtube.com/watch?v=%s' % video_id
for video_id
in video_ids
]
1070 url_results
= [self
.url_result(rurl
, 'Youtube') for rurl
in urls
]
1071 return [self
.playlist_result(url_results
, playlist_title
= username
)]
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos API ("ytsearch" keyword)."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; keep requesting pages until we
        # have enough ids or the reported totalItems caps the limit below n.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = [video['id'] for video in api_response['items']]
            video_ids += new_ids

            # totalItems may be smaller than n; never request past it
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The final page may overshoot the requested count; trim to n
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Turn a YouTube show page into the playlists of its seasons."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
        season_urls = ['https://www.youtube.com' + season.group(1) for season in m_seasons]
        return [self.url_result(season_url, 'YoutubePlaylist') for season_url in season_urls]
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True

    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    # NOTE(review): _real_extract also reads a _PAGING_STEP attribute that is
    # not visible here -- confirm it is defined on this class or a subclass.

    @property
    def _FEED_TEMPLATE(self):
        # _FEED_TEMPLATE is used as an attribute (self._FEED_TEMPLATE % paging),
        # hence the property. Personal feeds need a different ajax action.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        # Derived from the subclass feed name, e.g. u'youtube:subscriptions'
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # Feeds require authentication (_LOGIN_REQUIRED above)
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i * self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            # Loop variable named video_id so the builtin `id` is not shadowed
            feed_entries.extend(self.url_result(video_id, 'Youtube') for video_id in ids)
            # A null 'paging' value marks the last page of the feed
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's subscriptions feed."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    # Feed name inserted into the feed_ajax query by the base class
    _FEED_NAME = 'subscriptions'
    # Title given to the resulting playlist
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's recommended videos."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    # Feed name inserted into the feed_ajax query by the base class
    _FEED_NAME = 'recommended'
    # Title given to the resulting playlist
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's "watch later" list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    # Feed name inserted into the feed_ajax query by the base class
    _FEED_NAME = 'watch_later'
    # Title given to the resulting playlist
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Watch-later is a per-user feed, so use action_load_personal_feed
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the user's favourites page to its underlying playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of the backing playlist; fetch
        # the page, pull the id out and hand off to the playlist extractor.
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        list_id = self._search_regex(r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')