9 from .common
import InfoExtractor
, SearchInfoExtractor
15 compat_urllib_request
,
26 class YoutubeBaseInfoExtractor(InfoExtractor
):
27 """Provide base functions for Youtube extractors"""
28 _LOGIN_URL
= 'https://accounts.google.com/ServiceLogin'
29 _LANG_URL
= r
'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
30 _AGE_URL
= 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
31 _NETRC_MACHINE
= 'youtube'
32 # If True it will raise an error if no login info is provided
33 _LOGIN_REQUIRED
= False
def report_lang(self):
    """Log that we are about to set the interface language."""
    message = u'Setting language'
    self.to_screen(message)
39 def _set_language(self
):
40 request
= compat_urllib_request
.Request(self
._LANG
_URL
)
43 compat_urllib_request
.urlopen(request
).read()
44 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
45 self
._downloader
.report_warning(u
'unable to set language: %s' % compat_str(err
))
50 (username
, password
) = self
._get
_login
_info
()
51 # No authentication to be performed
53 if self
._LOGIN
_REQUIRED
:
54 raise ExtractorError(u
'No login info available, needed for using %s.' % self
.IE_NAME
, expected
=True)
57 request
= compat_urllib_request
.Request(self
._LOGIN
_URL
)
59 login_page
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
60 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
61 self
._downloader
.report_warning(u
'unable to fetch login page: %s' % compat_str(err
))
66 match
= re
.search(re
.compile(r
'<input.+?name="GALX".+?value="(.+?)"', re
.DOTALL
), login_page
)
69 match
= re
.search(re
.compile(r
'<input.+?name="dsh".+?value="(.+?)"', re
.DOTALL
), login_page
)
75 u
'continue': u
'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
79 u
'PersistentCookie': u
'yes',
81 u
'bgresponse': u
'js_disabled',
82 u
'checkConnection': u
'',
83 u
'checkedDomains': u
'youtube',
89 u
'signIn': u
'Sign in',
91 u
'service': u
'youtube',
95 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
97 login_form
= dict((k
.encode('utf-8'), v
.encode('utf-8')) for k
,v
in login_form_strs
.items())
98 login_data
= compat_urllib_parse
.urlencode(login_form
).encode('ascii')
99 request
= compat_urllib_request
.Request(self
._LOGIN
_URL
, login_data
)
102 login_results
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
103 if re
.search(r
'(?i)<form[^>]* id="gaia_loginform"', login_results
) is not None:
104 self
._downloader
.report_warning(u
'unable to log in: bad username or password')
106 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
107 self
._downloader
.report_warning(u
'unable to log in: %s' % compat_str(err
))
111 def _confirm_age(self
):
114 'action_confirm': 'Confirm',
116 request
= compat_urllib_request
.Request(self
._AGE
_URL
, compat_urllib_parse
.urlencode(age_form
))
118 self
.report_age_confirmation()
119 compat_urllib_request
.urlopen(request
).read().decode('utf-8')
120 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
121 raise ExtractorError(u
'Unable to confirm age: %s' % compat_str(err
))
124 def _real_initialize(self
):
125 if self
._downloader
is None:
127 if not self
._set
_language
():
129 if not self
._login
():
133 class YoutubeIE(YoutubeBaseInfoExtractor
):
134 IE_DESC
= u
'YouTube.com'
137 (?:https?://)? # http(s):// (optional)
138 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
139 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
140 (?:.*?\#/)? # handle anchor (#/) redirect urls
141 (?: # the various things that can precede the ID:
142 (?:(?:v|embed|e)/) # v/ or embed/ or e/
143 |(?: # or the v= param in all its forms
144 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
145 (?:\?|\#!?) # the params delimiter ? or # or #!
146 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
149 )? # optional -> youtube.com/xxxx is OK
150 )? # all until now is optional -> you can pass the naked ID
151 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
152 (?(1).+)? # if we found the ID, everything can follow
154 _NEXT_URL_RE
= r
'[\?&]next_url=([^&]+)'
155 # Listed in order of quality
156 _available_formats
= ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
158 '96', '95', '94', '93', '92', '132', '151',
160 '85', '84', '102', '83', '101', '82', '100',
162 '138', '137', '248', '136', '247', '135', '246',
163 '245', '244', '134', '243', '133', '242', '160',
165 '141', '172', '140', '171', '139',
167 _available_formats_prefer_free
= ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
169 '96', '95', '94', '93', '92', '132', '151',
171 '85', '102', '84', '101', '83', '100', '82',
173 '138', '248', '137', '247', '136', '246', '245',
174 '244', '135', '243', '134', '242', '133', '160',
176 '172', '141', '171', '140', '139',
178 _video_formats_map
= {
179 'flv': ['35', '34', '6', '5'],
180 '3gp': ['36', '17', '13'],
181 'mp4': ['38', '37', '22', '18'],
182 'webm': ['46', '45', '44', '43'],
184 _video_extensions
= {
238 _video_dimensions
= {
320 u
"url": u
"http://www.youtube.com/watch?v=BaW_jenozKc",
321 u
"file": u
"BaW_jenozKc.mp4",
323 u
"title": u
"youtube-dl test video \"'/\\ä↭𝕐",
324 u
"uploader": u
"Philipp Hagemeister",
325 u
"uploader_id": u
"phihag",
326 u
"upload_date": u
"20121002",
327 u
"description": u
"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
331 u
"url": u
"http://www.youtube.com/watch?v=1ltcDfZMA3U",
332 u
"file": u
"1ltcDfZMA3U.flv",
333 u
"note": u
"Test VEVO video (#897)",
335 u
"upload_date": u
"20070518",
336 u
"title": u
"Maps - It Will Find You",
337 u
"description": u
"Music video by Maps performing It Will Find You.",
338 u
"uploader": u
"MuteUSA",
339 u
"uploader_id": u
"MuteUSA"
343 u
"url": u
"http://www.youtube.com/watch?v=UxxajLWwzqY",
344 u
"file": u
"UxxajLWwzqY.mp4",
345 u
"note": u
"Test generic use_cipher_signature video (#897)",
347 u
"upload_date": u
"20120506",
348 u
"title": u
"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
349 u
"description": u
"md5:3e2666e0a55044490499ea45fe9037b7",
350 u
"uploader": u
"Icona Pop",
351 u
"uploader_id": u
"IconaPop"
355 u
"url": u
"https://www.youtube.com/watch?v=07FYdnEawAQ",
356 u
"file": u
"07FYdnEawAQ.mp4",
357 u
"note": u
"Test VEVO video with age protection (#956)",
359 u
"upload_date": u
"20130703",
360 u
"title": u
"Justin Timberlake - Tunnel Vision (Explicit)",
361 u
"description": u
"md5:64249768eec3bc4276236606ea996373",
362 u
"uploader": u
"justintimberlakeVEVO",
363 u
"uploader_id": u
"justintimberlakeVEVO"
367 u
'url': u
'https://www.youtube.com/watch?v=TGi3HqYrWHE',
368 u
'file': u
'TGi3HqYrWHE.mp4',
369 u
'note': u
'm3u8 video',
371 u
'title': u
'Triathlon - Men - London 2012 Olympic Games',
372 u
'description': u
'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
373 u
'uploader': u
'olympic',
374 u
'upload_date': u
'20120807',
375 u
'uploader_id': u
'olympic',
378 u
'skip_download': True,
def suitable(cls, url):
    """Return True when this extractor should handle *url*.

    Playlist and subscription URLs are claimed by their own, more
    specific extractors, so they are rejected here first.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    if YoutubeSubscriptionsIE.suitable(url):
        return False
    return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_video_webpage_download(self, video_id):
    """Announce that the video webpage download is starting."""
    notice = u'%s: Downloading video webpage' % video_id
    self.to_screen(notice)
def report_video_info_webpage_download(self, video_id):
    """Announce that the video info webpage download is starting."""
    notice = u'%s: Downloading video info webpage' % video_id
    self.to_screen(notice)
def report_video_subtitles_download(self, video_id):
    """Report attempt to check the available subtitles.

    The previous docstring claimed this downloads the video info
    webpage; that was a copy-paste mistake — this method only logs
    the subtitle-availability check.
    """
    self.to_screen(u'%s: Checking available subtitles' % video_id)
def report_video_subtitles_request(self, video_id, sub_lang, format):
    """Report attempt to download one subtitle track.

    video_id: id of the video the subtitles belong to.
    sub_lang: subtitle language code (e.g. 'en').
    format: requested subtitle format/extension (e.g. 'srt').

    The previous docstring ("download video info webpage") was a
    copy-paste mistake and has been corrected.
    """
    self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
def report_video_subtitles_available(self, video_id, sub_lang_list):
    """Report the subtitle languages available for a video.

    sub_lang_list: mapping of language code -> language name; only
    the keys (language codes) are shown to the user.
    """
    # str.join accepts any iterable, so the intermediate list() the
    # original built around .keys() was redundant.
    sub_lang = ",".join(sub_lang_list.keys())
    self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
def report_information_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has started."""
    notice = u'%s: Extracting video information' % video_id
    self.to_screen(notice)
def report_unavailable_format(self, video_id, format):
    """Tell the user that *format* is not available for this video.

    The previous docstring ("Report extracted video URL.") was a
    copy-paste mistake and has been corrected.
    """
    self.to_screen(u'%s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
423 def _decrypt_signature(self
, s
):
424 """Turn the encrypted s field into a working signature"""
427 return s
[25] + s
[3:25] + s
[0] + s
[26:42] + s
[79] + s
[43:79] + s
[91] + s
[80:83]
429 return s
[25] + s
[3:25] + s
[2] + s
[26:40] + s
[77] + s
[41:77] + s
[89] + s
[78:81]
431 return s
[84:78:-1] + s
[87] + s
[77:60:-1] + s
[0] + s
[59:3:-1]
433 return s
[7:28] + s
[87] + s
[29:45] + s
[55] + s
[46:55] + s
[2] + s
[56:87] + s
[28]
435 return s
[6:27] + s
[4] + s
[28:39] + s
[27] + s
[40:59] + s
[2] + s
[60:]
437 return s
[81:73:-1] + s
[84] + s
[72:58:-1] + s
[0] + s
[57:35:-1] + s
[85] + s
[34:0:-1]
439 return s
[83:34:-1] + s
[0] + s
[33:27:-1] + s
[3] + s
[26:19:-1] + s
[34] + s
[18:3:-1] + s
[27]
441 return s
[81:36:-1] + s
[0] + s
[35:2:-1]
443 return s
[81:64:-1] + s
[82] + s
[63:52:-1] + s
[45] + s
[51:45:-1] + s
[1] + s
[44:1:-1] + s
[0]
445 return s
[1:19] + s
[0] + s
[20:68] + s
[19] + s
[69:82]
447 return s
[56] + s
[79:56:-1] + s
[41] + s
[55:41:-1] + s
[80] + s
[40:34:-1] + s
[0] + s
[33:29:-1] + s
[34] + s
[28:9:-1] + s
[29] + s
[8:0:-1] + s
[9]
449 return s
[1:19] + s
[0] + s
[20:68] + s
[19] + s
[69:80]
451 return s
[54] + s
[77:54:-1] + s
[39] + s
[53:39:-1] + s
[78] + s
[38:34:-1] + s
[0] + s
[33:29:-1] + s
[34] + s
[28:9:-1] + s
[29] + s
[8:0:-1] + s
[9]
454 raise ExtractorError(u
'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s
)))
456 def _decrypt_signature_age_gate(self
, s
):
457 # The videos with age protection use another player, so the algorithms
460 return s
[2:63] + s
[82] + s
[64:82] + s
[63]
462 # Fallback to the other algorithms
463 return self
._decrypt
_signature
(s
)
466 def _get_available_subtitles(self
, video_id
):
467 self
.report_video_subtitles_download(video_id
)
468 request
= compat_urllib_request
.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
)
470 sub_list
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
471 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
472 self
._downloader
.report_warning(u
'unable to download video subtitles: %s' % compat_str(err
))
474 sub_lang_list
= re
.findall(r
'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list
)
475 sub_lang_list
= dict((l
[1], l
[0]) for l
in sub_lang_list
)
476 if not sub_lang_list
:
477 self
._downloader
.report_warning(u
'video doesn\'t have subtitles')
481 def _list_available_subtitles(self
, video_id
):
482 sub_lang_list
= self
._get
_available
_subtitles
(video_id
)
483 self
.report_video_subtitles_available(video_id
, sub_lang_list
)
485 def _request_subtitle(self
, sub_lang
, sub_name
, video_id
, format
):
487 Return the subtitle as a string or None if they are not found
489 self
.report_video_subtitles_request(video_id
, sub_lang
, format
)
490 params
= compat_urllib_parse
.urlencode({
496 url
= 'http://www.youtube.com/api/timedtext?' + params
498 sub
= compat_urllib_request
.urlopen(url
).read().decode('utf-8')
499 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
500 self
._downloader
.report_warning(u
'unable to download video subtitles for %s: %s' % (sub_lang
, compat_str(err
)))
503 self
._downloader
.report_warning(u
'Did not fetch video subtitles')
507 def _request_automatic_caption(self
, video_id
, webpage
):
508 """We need the webpage for getting the captions url, pass it as an
509 argument to speed up the process."""
510 sub_lang
= (self
._downloader
.params
.get('subtitleslangs') or ['en'])[0]
511 sub_format
= self
._downloader
.params
.get('subtitlesformat')
512 self
.to_screen(u
'%s: Looking for automatic captions' % video_id
)
513 mobj
= re
.search(r
';ytplayer.config = ({.*?});', webpage
)
514 err_msg
= u
'Couldn\'t find automatic captions for "%s"' % sub_lang
516 self
._downloader
.report_warning(err_msg
)
518 player_config
= json
.loads(mobj
.group(1))
520 args
= player_config
[u
'args']
521 caption_url
= args
[u
'ttsurl']
522 timestamp
= args
[u
'timestamp']
523 params
= compat_urllib_parse
.urlencode({
530 subtitles_url
= caption_url
+ '&' + params
531 sub
= self
._download
_webpage
(subtitles_url
, video_id
, u
'Downloading automatic captions')
532 return {sub_lang: sub}
533 # An extractor error can be raised by the download process if there are
534 # no automatic captions but there are subtitles
535 except (KeyError, ExtractorError
):
536 self
._downloader
.report_warning(err_msg
)
539 def _extract_subtitles(self
, video_id
):
541 Return a dictionary: {language: subtitles} or {} if the subtitles
544 available_subs_list
= self
._get
_available
_subtitles
(video_id
)
545 sub_format
= self
._downloader
.params
.get('subtitlesformat')
546 if not available_subs_list
: #There was some error, it didn't get the available subtitles
548 if self
._downloader
.params
.get('allsubtitles', False):
549 sub_lang_list
= available_subs_list
551 if self
._downloader
.params
.get('subtitleslangs', False):
552 reqested_langs
= self
._downloader
.params
.get('subtitleslangs')
553 elif 'en' in available_subs_list
:
554 reqested_langs
= ['en']
556 reqested_langs
= [list(available_subs_list
.keys())[0]]
559 for sub_lang
in reqested_langs
:
560 if not sub_lang
in available_subs_list
:
561 self
._downloader
.report_warning(u
'no closed captions found in the specified language "%s"' % sub_lang
)
563 sub_lang_list
[sub_lang
] = available_subs_list
[sub_lang
]
565 for sub_lang
in sub_lang_list
:
566 subtitle
= self
._request
_subtitle
(sub_lang
, sub_lang_list
[sub_lang
].encode('utf-8'), video_id
, sub_format
)
568 subtitles
[sub_lang
] = subtitle
571 def _print_formats(self
, formats
):
572 print('Available formats:')
574 print('%s\t:\t%s\t[%s]%s' %(x
, self
._video
_extensions
.get(x
, 'flv'),
575 self
._video
_dimensions
.get(x
, '???'),
576 ' ('+self
._special
_itags
[x
]+')' if x
in self
._special
_itags
else ''))
578 def _extract_id(self
, url
):
579 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
581 raise ExtractorError(u
'Invalid URL: %s' % url
)
582 video_id
= mobj
.group(2)
585 def _get_video_url_list(self
, url_map
):
587 Transform a dictionary in the format {itag:url} to a list of (itag, url)
588 with the requested formats.
590 req_format
= self
._downloader
.params
.get('format', None)
591 format_limit
= self
._downloader
.params
.get('format_limit', None)
592 available_formats
= self
._available
_formats
_prefer
_free
if self
._downloader
.params
.get('prefer_free_formats', False) else self
._available
_formats
593 if format_limit
is not None and format_limit
in available_formats
:
594 format_list
= available_formats
[available_formats
.index(format_limit
):]
596 format_list
= available_formats
597 existing_formats
= [x
for x
in format_list
if x
in url_map
]
598 if len(existing_formats
) == 0:
599 raise ExtractorError(u
'no known formats available for video')
600 if self
._downloader
.params
.get('listformats', None):
601 self
._print
_formats
(existing_formats
)
603 if req_format
is None or req_format
== 'best':
604 video_url_list
= [(existing_formats
[0], url_map
[existing_formats
[0]])] # Best quality
605 elif req_format
== 'worst':
606 video_url_list
= [(existing_formats
[-1], url_map
[existing_formats
[-1]])] # worst quality
607 elif req_format
in ('-1', 'all'):
608 video_url_list
= [(f
, url_map
[f
]) for f
in existing_formats
] # All formats
610 # Specific formats. We pick the first in a slash-delimited sequence.
611 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
612 # available in the specified format. For example,
613 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
614 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
615 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
616 req_formats
= req_format
.split('/')
617 video_url_list
= None
618 for rf
in req_formats
:
620 video_url_list
= [(rf
, url_map
[rf
])]
622 if rf
in self
._video
_formats
_map
:
623 for srf
in self
._video
_formats
_map
[rf
]:
625 video_url_list
= [(srf
, url_map
[srf
])]
630 if video_url_list
is None:
631 raise ExtractorError(u
'requested format not available')
632 return video_url_list
634 def _extract_from_m3u8(self
, manifest_url
, video_id
):
636 def _get_urls(_manifest
):
637 lines
= _manifest
.split('\n')
638 urls
= filter(lambda l
: l
and not l
.startswith('#'),
641 manifest
= self
._download
_webpage
(manifest_url
, video_id
, u
'Downloading formats manifest')
642 formats_urls
= _get_urls(manifest
)
643 for format_url
in formats_urls
:
644 itag
= self
._search
_regex
(r
'itag/(\d+?)/', format_url
, 'itag')
645 url_map
[itag
] = format_url
648 def _real_extract(self
, url
):
649 if re
.match(r
'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url
):
650 self
._downloader
.report_warning(u
'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
652 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
653 mobj
= re
.search(self
._NEXT
_URL
_RE
, url
)
655 url
= 'https://www.youtube.com/' + compat_urllib_parse
.unquote(mobj
.group(1)).lstrip('/')
656 video_id
= self
._extract
_id
(url
)
659 self
.report_video_webpage_download(video_id
)
660 url
= 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
661 request
= compat_urllib_request
.Request(url
)
663 video_webpage_bytes
= compat_urllib_request
.urlopen(request
).read()
664 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
665 raise ExtractorError(u
'Unable to download video webpage: %s' % compat_str(err
))
667 video_webpage
= video_webpage_bytes
.decode('utf-8', 'ignore')
669 # Attempt to extract SWF player URL
670 mobj
= re
.search(r
'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage
)
672 player_url
= re
.sub(r
'\\(.)', r
'\1', mobj
.group(1))
677 self
.report_video_info_webpage_download(video_id
)
678 if re
.search(r
'player-age-gate-content">', video_webpage
) is not None:
679 self
.report_age_confirmation()
681 # We simulate the access to the video from www.youtube.com/v/{video_id}
682 # this can be viewed without login into Youtube
683 data
= compat_urllib_parse
.urlencode({'video_id': video_id
,
687 'eurl': 'https://youtube.googleapis.com/v/' + video_id
,
691 video_info_url
= 'https://www.youtube.com/get_video_info?' + data
692 video_info_webpage
= self
._download
_webpage
(video_info_url
, video_id
,
694 errnote
='unable to download video info webpage')
695 video_info
= compat_parse_qs(video_info_webpage
)
698 for el_type
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
699 video_info_url
= ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
700 % (video_id
, el_type
))
701 video_info_webpage
= self
._download
_webpage
(video_info_url
, video_id
,
703 errnote
='unable to download video info webpage')
704 video_info
= compat_parse_qs(video_info_webpage
)
705 if 'token' in video_info
:
707 if 'token' not in video_info
:
708 if 'reason' in video_info
:
709 raise ExtractorError(u
'YouTube said: %s' % video_info
['reason'][0], expected
=True)
711 raise ExtractorError(u
'"token" parameter not in video info for unknown reason')
713 # Check for "rental" videos
714 if 'ypc_video_rental_bar_text' in video_info
and 'author' not in video_info
:
715 raise ExtractorError(u
'"rental" videos not supported')
717 # Start extracting information
718 self
.report_information_extraction(video_id
)
721 if 'author' not in video_info
:
722 raise ExtractorError(u
'Unable to extract uploader name')
723 video_uploader
= compat_urllib_parse
.unquote_plus(video_info
['author'][0])
726 video_uploader_id
= None
727 mobj
= re
.search(r
'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage
)
729 video_uploader_id
= mobj
.group(1)
731 self
._downloader
.report_warning(u
'unable to extract uploader nickname')
734 if 'title' not in video_info
:
735 raise ExtractorError(u
'Unable to extract video title')
736 video_title
= compat_urllib_parse
.unquote_plus(video_info
['title'][0])
739 # We try first to get a high quality image:
740 m_thumb
= re
.search(r
'<span itemprop="thumbnail".*?href="(.*?)">',
741 video_webpage
, re
.DOTALL
)
742 if m_thumb
is not None:
743 video_thumbnail
= m_thumb
.group(1)
744 elif 'thumbnail_url' not in video_info
:
745 self
._downloader
.report_warning(u
'unable to extract video thumbnail')
747 else: # don't panic if we can't find it
748 video_thumbnail
= compat_urllib_parse
.unquote_plus(video_info
['thumbnail_url'][0])
752 mobj
= re
.search(r
'id="eow-date.*?>(.*?)</span>', video_webpage
, re
.DOTALL
)
754 upload_date
= ' '.join(re
.sub(r
'[/,-]', r
' ', mobj
.group(1)).split())
755 upload_date
= unified_strdate(upload_date
)
758 video_description
= get_element_by_id("eow-description", video_webpage
)
759 if video_description
:
760 video_description
= clean_html(video_description
)
762 fd_mobj
= re
.search(r
'<meta name="description" content="([^"]+)"', video_webpage
)
764 video_description
= unescapeHTML(fd_mobj
.group(1))
766 video_description
= u
''
769 video_subtitles
= None
771 if self
._downloader
.params
.get('writesubtitles', False) or self
._downloader
.params
.get('allsubtitles', False):
772 video_subtitles
= self
._extract
_subtitles
(video_id
)
773 elif self
._downloader
.params
.get('writeautomaticsub', False):
774 video_subtitles
= self
._request
_automatic
_caption
(video_id
, video_webpage
)
776 if self
._downloader
.params
.get('listsubtitles', False):
777 self
._list
_available
_subtitles
(video_id
)
780 if 'length_seconds' not in video_info
:
781 self
._downloader
.report_warning(u
'unable to extract video duration')
784 video_duration
= compat_urllib_parse
.unquote_plus(video_info
['length_seconds'][0])
786 # Decide which formats to download
789 mobj
= re
.search(r
';ytplayer.config = ({.*?});', video_webpage
)
791 raise ValueError('Could not find vevo ID')
792 info
= json
.loads(mobj
.group(1))
794 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
795 # this signatures are encrypted
796 m_s
= re
.search(r
'[&,]s=', args
['url_encoded_fmt_stream_map'])
798 self
.to_screen(u
'%s: Encrypted signatures detected.' % video_id
)
799 video_info
['url_encoded_fmt_stream_map'] = [args
['url_encoded_fmt_stream_map']]
800 m_s
= re
.search(r
'[&,]s=', args
.get('adaptive_fmts', u
''))
802 if 'url_encoded_fmt_stream_map' in video_info
:
803 video_info
['url_encoded_fmt_stream_map'][0] += ',' + args
['adaptive_fmts']
805 video_info
['url_encoded_fmt_stream_map'] = [args
['adaptive_fmts']]
806 elif 'adaptive_fmts' in video_info
:
807 if 'url_encoded_fmt_stream_map' in video_info
:
808 video_info
['url_encoded_fmt_stream_map'][0] += ',' + video_info
['adaptive_fmts'][0]
810 video_info
['url_encoded_fmt_stream_map'] = video_info
['adaptive_fmts']
814 if 'conn' in video_info
and video_info
['conn'][0].startswith('rtmp'):
815 self
.report_rtmp_download()
816 video_url_list
= [(None, video_info
['conn'][0])]
817 elif 'url_encoded_fmt_stream_map' in video_info
and len(video_info
['url_encoded_fmt_stream_map']) >= 1:
818 if 'rtmpe%3Dyes' in video_info
['url_encoded_fmt_stream_map'][0]:
819 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected
=True)
821 for url_data_str
in video_info
['url_encoded_fmt_stream_map'][0].split(','):
822 url_data
= compat_parse_qs(url_data_str
)
823 if 'itag' in url_data
and 'url' in url_data
:
824 url
= url_data
['url'][0]
825 if 'sig' in url_data
:
826 url
+= '&signature=' + url_data
['sig'][0]
827 elif 's' in url_data
:
828 if self
._downloader
.params
.get('verbose'):
831 player_version
= self
._search
_regex
(r
'ad3-(.+?)\.swf',
832 video_info
['ad3_module'][0] if 'ad3_module' in video_info
else 'NOT FOUND',
833 'flash player', fatal
=False)
834 player
= 'flash player %s' % player_version
836 player
= u
'html5 player %s' % self
._search
_regex
(r
'html5player-(.+?)\.js', video_webpage
,
837 'html5 player', fatal
=False)
838 parts_sizes
= u
'.'.join(compat_str(len(part
)) for part
in s
.split('.'))
839 self
.to_screen(u
'encrypted signature length %d (%s), itag %s, %s' %
840 (len(s
), parts_sizes
, url_data
['itag'][0], player
))
841 encrypted_sig
= url_data
['s'][0]
843 signature
= self
._decrypt
_signature
_age
_gate
(encrypted_sig
)
845 signature
= self
._decrypt
_signature
(encrypted_sig
)
846 url
+= '&signature=' + signature
847 if 'ratebypass' not in url
:
848 url
+= '&ratebypass=yes'
849 url_map
[url_data
['itag'][0]] = url
850 video_url_list
= self
._get
_video
_url
_list
(url_map
)
851 if not video_url_list
:
853 elif video_info
.get('hlsvp'):
854 manifest_url
= video_info
['hlsvp'][0]
855 url_map
= self
._extract
_from
_m
3u8(manifest_url
, video_id
)
856 video_url_list
= self
._get
_video
_url
_list
(url_map
)
857 if not video_url_list
:
861 raise ExtractorError(u
'no conn or url_encoded_fmt_stream_map information found in video info')
864 for format_param
, video_real_url
in video_url_list
:
866 video_extension
= self
._video
_extensions
.get(format_param
, 'flv')
868 video_format
= '{0} - {1}{2}'.format(format_param
if format_param
else video_extension
,
869 self
._video
_dimensions
.get(format_param
, '???'),
870 ' ('+self
._special
_itags
[format_param
]+')' if format_param
in self
._special
_itags
else '')
874 'url': video_real_url
,
875 'uploader': video_uploader
,
876 'uploader_id': video_uploader_id
,
877 'upload_date': upload_date
,
878 'title': video_title
,
879 'ext': video_extension
,
880 'format': video_format
,
881 'thumbnail': video_thumbnail
,
882 'description': video_description
,
883 'player_url': player_url
,
884 'subtitles': video_subtitles
,
885 'duration': video_duration
889 class YoutubePlaylistIE(InfoExtractor
):
890 IE_DESC
= u
'YouTube.com playlists'
896 (?:course|view_play_list|my_playlists|artist|playlist|watch)
897 \? (?:.*?&)*? (?:p|a|list)=
900 ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
903 ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
905 _TEMPLATE_URL
= 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
907 IE_NAME
= u
'youtube:playlist'
def suitable(cls, url):
    """Return True when *url* matches this extractor's URL pattern."""
    pattern = cls._VALID_URL
    return re.match(pattern, url, re.VERBOSE) is not None
914 def _real_extract(self
, url
):
915 # Extract playlist id
916 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
918 raise ExtractorError(u
'Invalid URL: %s' % url
)
920 # Download playlist videos from API
921 playlist_id
= mobj
.group(1) or mobj
.group(2)
924 for page_num
in itertools
.count(1):
925 start_index
= self
._MAX
_RESULTS
* (page_num
- 1) + 1
926 if start_index
>= 1000:
927 self
._downloader
.report_warning(u
'Max number of results reached')
929 url
= self
._TEMPLATE
_URL
% (playlist_id
, self
._MAX
_RESULTS
, start_index
)
930 page
= self
._download
_webpage
(url
, playlist_id
, u
'Downloading page #%s' % page_num
)
933 response
= json
.loads(page
)
934 except ValueError as err
:
935 raise ExtractorError(u
'Invalid JSON in API response: ' + compat_str(err
))
937 if 'feed' not in response
:
938 raise ExtractorError(u
'Got a malformed response from YouTube API')
939 playlist_title
= response
['feed']['title']['$t']
940 if 'entry' not in response
['feed']:
941 # Number of videos is a multiple of self._MAX_RESULTS
944 for entry
in response
['feed']['entry']:
945 index
= entry
['yt$position']['$t']
946 if 'media$group' in entry
and 'media$player' in entry
['media$group']:
947 videos
.append((index
, entry
['media$group']['media$player']['url']))
949 videos
= [v
[1] for v
in sorted(videos
)]
951 url_results
= [self
.url_result(vurl
, 'Youtube') for vurl
in videos
]
952 return [self
.playlist_result(url_results
, playlist_id
, playlist_title
)]
955 class YoutubeChannelIE(InfoExtractor
):
956 IE_DESC
= u
'YouTube.com channels'
957 _VALID_URL
= r
"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
958 _TEMPLATE_URL
= 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
959 _MORE_PAGES_INDICATOR
= 'yt-uix-load-more'
960 _MORE_PAGES_URL
= 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
961 IE_NAME
= u
'youtube:channel'
963 def extract_videos_from_page(self
, page
):
965 for mobj
in re
.finditer(r
'href="/watch\?v=([0-9A-Za-z_-]+)&?', page
):
966 if mobj
.group(1) not in ids_in_page
:
967 ids_in_page
.append(mobj
.group(1))
970 def _real_extract(self
, url
):
972 mobj
= re
.match(self
._VALID
_URL
, url
)
974 raise ExtractorError(u
'Invalid URL: %s' % url
)
976 # Download channel page
977 channel_id
= mobj
.group(1)
981 url
= self
._TEMPLATE
_URL
% (channel_id
, pagenum
)
982 page
= self
._download
_webpage
(url
, channel_id
,
983 u
'Downloading page #%s' % pagenum
)
985 # Extract video identifiers
986 ids_in_page
= self
.extract_videos_from_page(page
)
987 video_ids
.extend(ids_in_page
)
989 # Download any subsequent channel pages using the json-based channel_ajax query
990 if self
._MORE
_PAGES
_INDICATOR
in page
:
991 for pagenum
in itertools
.count(1):
992 url
= self
._MORE
_PAGES
_URL
% (pagenum
, channel_id
)
993 page
= self
._download
_webpage
(url
, channel_id
,
994 u
'Downloading page #%s' % pagenum
)
996 page
= json
.loads(page
)
998 ids_in_page
= self
.extract_videos_from_page(page
['content_html'])
999 video_ids
.extend(ids_in_page
)
1001 if self
._MORE
_PAGES
_INDICATOR
not in page
['load_more_widget_html']:
1004 self
._downloader
.to_screen(u
'[youtube] Channel %s: Found %i videos' % (channel_id
, len(video_ids
)))
1006 urls
= ['http://www.youtube.com/watch?v=%s' % id for id in video_ids
]
1007 url_entries
= [self
.url_result(eurl
, 'Youtube') for eurl
in urls
]
1008 return [self
.playlist_result(url_entries
, channel_id
)]
class YoutubeUserIE(InfoExtractor):
    """Extractor for a YouTube user's uploaded videos.

    Accepts user-page URLs or the `ytuser:` keyword and pages through the
    GData API to collect every uploaded video id.
    """
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract video identifiers, de-duplicated while preserving order.
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title=username)]
class YoutubeSearchIE(SearchInfoExtractor):
    """Extractor for the `ytsearch` keyword — queries the GData search API."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    # NOTE(review): _MAX_RESULTS was not visible in the mangled source but is
    # required by SearchInfoExtractor — confirm the historical value.
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Each API page returns at most 50 items; keep fetching until we
        # have either `n` ids or the API's reported total.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Clamp the target to however many results actually exist.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube') for video_id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Extractor for (multi-season) YouTube shows."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        page = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show is published as a separate playlist.
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', page))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        return [self.url_result('https://www.youtube.com' + m.group(1), 'YoutubePlaylist')
                for m in season_matches]
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # Number of entries per feed_ajax page.
    # NOTE(review): value not visible in the mangled source — confirm upstream.
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # `%%s` survives the first interpolation so _real_extract can fill
        # in the paging offset later.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # Feeds are per-account, so authentication is mandatory.
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i*self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
            # A null paging token means there are no further pages.
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's subscriptions."""
    # Fixed missing space before the parenthesis in the user-facing description.
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's recommended videos."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's "watch later" list."""
    # Watch-later is tied to the account, so the personal-feed action is used.
    _PERSONAL_FEED = True
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the authenticated user's favourite videos."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of its backing playlist; delegate
        # the actual extraction to the playlist extractor.
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        fav_playlist_id = self._search_regex(r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(fav_playlist_id, 'YoutubePlaylist')