]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/youtube.py
[dailymotion] Convert to new subtitles system
[yt-dlp.git] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 from __future__ import unicode_literals
4
5
6 import itertools
7 import json
8 import os.path
9 import re
10 import time
11 import traceback
12
13 from .common import InfoExtractor, SearchInfoExtractor
14 from .subtitles import SubtitlesInfoExtractor
15 from ..jsinterp import JSInterpreter
16 from ..swfinterp import SWFInterpreter
17 from ..compat import (
18 compat_chr,
19 compat_parse_qs,
20 compat_urllib_parse,
21 compat_urllib_request,
22 compat_urlparse,
23 compat_str,
24 )
25 from ..utils import (
26 clean_html,
27 ExtractorError,
28 float_or_none,
29 get_element_by_attribute,
30 get_element_by_id,
31 int_or_none,
32 OnDemandPagedList,
33 orderedSet,
34 unescapeHTML,
35 unified_strdate,
36 uppercase_escape,
37 )
38
39
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        # Force the English interface (hl=en) and maximum result counts via
        # the PREF cookie so that page-scraping regexes stay stable.
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # BUGFIX: previously returned None here, contradicting the
            # documented True/False contract of this method.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, 'Login GALX parameter')

        # Log in
        login_form_strs = {
            'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            'Email': username,
            'GALX': galx,
            'Passwd': password,

            'PersistentCookie': 'yes',
            '_utf8': '霱',
            'bgresponse': 'js_disabled',
            'checkConnection': '',
            'checkedDomains': 'youtube',
            'dnConn': '',
            'pstMsg': '0',
            'rmShown': '1',
            'secTok': '',
            'signIn': 'Sign in',
            'timeStmp': '',
            'service': 'youtube',
            'uilel': '3',
            'hl': 'en_US',
        }

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note='Logging in', errnote='unable to log in', fatal=False)
        if login_results is False:
            return False

        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)

        # Two-Factor
        # TODO add SMS and phone call support - these require making a request and then prompting the user

        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
            tfa_code = self._get_tfa_info()

            if tfa_code is None:
                self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
                self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                return False

            # Unlike the first login form, secTok and timeStmp are both required for the TFA form

            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get secTok - did the page structure change?')
                # BUGFIX: previously fell through and crashed on
                # match.group(1) (AttributeError on None); treat as failure.
                return False
            secTok = match.group(1)
            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
            if match is None:
                self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
                # BUGFIX: same AttributeError crash as for secTok above.
                return False
            timeStmp = match.group(1)

            tfa_form_strs = {
                'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                'smsToken': '',
                'smsUserPin': tfa_code,
                'smsVerifyPin': 'Verify',

                'PersistentCookie': 'yes',
                'checkConnection': '',
                'checkedDomains': 'youtube',
                'pstMsg': '1',
                'secTok': secTok,
                'timeStmp': timeStmp,
                'service': 'youtube',
                'hl': 'en_US',
            }
            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')

            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
            tfa_results = self._download_webpage(
                tfa_req, None,
                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)

            if tfa_results is False:
                return False

            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
                self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
                return False
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
                self._downloader.report_warning('unable to log in - did the page structure change?')
                return False
            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                return False

        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        return True

    def _real_initialize(self):
        # Called once before any extraction; sets cookies and logs in when
        # credentials were supplied.
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
186
187
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = 'YouTube.com'
    # Accepts every known single-video URL shape (plus the naked 11-char id).
    # Group 1 is the optional URL prefix; group 2 is always the video id.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?!.*?&list=)                                            # combined list/video URLs are handled by the playlist IE
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Captures the target of redirect pages (age verification etc.) that pass
    # the real watch URL in a next_url query parameter.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Static itag -> format metadata table, merged into the formats parsed
    # from the stream maps / DASH manifest.  Negative 'preference' values
    # rank whole format families (HLS, 3D, DASH) below the default streams.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},  # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
        '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # Dash webm audio with opus inside
        '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
        '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
        '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }
307
    IE_NAME = 'youtube'
    # Test matrix: plain videos, encrypted signatures, age gates, DASH
    # manifests, naked ids and non-square-pixel videos.
    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
            'info_dict': {
                'id': 'BaW_jenozKc',
                'ext': 'mp4',
                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
                'uploader': 'Philipp Hagemeister',
                'uploader_id': 'phihag',
                'upload_date': '20121002',
                'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
                'categories': ['Science & Technology'],
                'like_count': int,
                'dislike_count': int,
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
            'note': 'Test generic use_cipher_signature video (#897)',
            'info_dict': {
                'id': 'UxxajLWwzqY',
                'ext': 'mp4',
                'upload_date': '20120506',
                'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
                'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
                'uploader': 'Icona Pop',
                'uploader_id': 'IconaPop',
            }
        },
        {
            'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
            'note': 'Test VEVO video with age protection (#956)',
            'info_dict': {
                'id': '07FYdnEawAQ',
                'ext': 'mp4',
                'upload_date': '20130703',
                'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
                'description': 'md5:64249768eec3bc4276236606ea996373',
                'uploader': 'justintimberlakeVEVO',
                'uploader_id': 'justintimberlakeVEVO',
            }
        },
        {
            'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
            'note': 'Embed-only video (#1746)',
            'info_dict': {
                'id': 'yZIXLfi8CZQ',
                'ext': 'mp4',
                'upload_date': '20120608',
                'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
                'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
                'uploader': 'SET India',
                'uploader_id': 'setindia'
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
            'note': '256k DASH audio (format 141) via DASH manifest',
            'info_dict': {
                'id': 'a9LDPn-MO4I',
                'ext': 'm4a',
                'upload_date': '20121002',
                'uploader_id': '8KVIDEO',
                'description': '',
                'uploader': '8KVIDEO',
                'title': 'UHDTV TEST 8K VIDEO.mp4'
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # DASH manifest with encrypted signature
        {
            'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            'info_dict': {
                'id': 'IB3lcPjvWLA',
                'ext': 'm4a',
                'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
                'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
                'uploader': 'AfrojackVEVO',
                'uploader_id': 'AfrojackVEVO',
                'upload_date': '20131011',
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # JS player signature function name containing $
        {
            'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
            'info_dict': {
                'id': 'nfWlot6h_JM',
                'ext': 'm4a',
                'title': 'Taylor Swift - Shake It Off',
                'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
                'uploader': 'TaylorSwiftVEVO',
                'uploader_id': 'TaylorSwiftVEVO',
                'upload_date': '20140818',
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # Controversy video
        {
            'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
            'info_dict': {
                'id': 'T4XJQO3qol8',
                'ext': 'mp4',
                'upload_date': '20100909',
                'uploader': 'The Amazing Atheist',
                'uploader_id': 'TheAmazingAtheist',
                'title': 'Burning Everyone\'s Koran',
                'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
            }
        },
        # Normal age-gate video (No vevo, embed allowed)
        {
            'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
            'info_dict': {
                'id': 'HtVdAasjOgU',
                'ext': 'mp4',
                'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
                'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
                'uploader': 'The Witcher',
                'uploader_id': 'WitcherGame',
                'upload_date': '20140605',
            },
        },
        # Age-gate video with encrypted signature
        {
            'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
            'info_dict': {
                'id': '6kLq3WMV1nU',
                'ext': 'mp4',
                'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
                'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
                'uploader': 'LloydVEVO',
                'uploader_id': 'LloydVEVO',
                'upload_date': '20110629',
            },
        },
        # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
        {
            'url': '__2ABJjxzNo',
            'info_dict': {
                'id': '__2ABJjxzNo',
                'ext': 'mp4',
                'upload_date': '20100430',
                'uploader_id': 'deadmau5',
                'description': 'md5:12c56784b8032162bb936a5f76d55360',
                'uploader': 'deadmau5',
                'title': 'Deadmau5 - Some Chords (HD)',
            },
            'expected_warnings': [
                'DASH manifest missing',
            ]
        },
        # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
        {
            'url': 'lqQg6PlCWgI',
            'info_dict': {
                'id': 'lqQg6PlCWgI',
                'ext': 'mp4',
                'upload_date': '20120731',
                'uploader_id': 'olympic',
                'description': 'HO09 - Women -  GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
                'uploader': 'Olympics',
                'title': 'Hockey - Women -  GER-AUS - London 2012 Olympic Games',
            },
            'params': {
                'skip_download': 'requires avconv',
            }
        },
        # Non-square pixels
        {
            'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
            'info_dict': {
                'id': '_b-2C3KPAM0',
                'ext': 'mp4',
                'stretched_ratio': 16 / 9.,
                'upload_date': '20110310',
                'uploader_id': 'AllenMeow',
                'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
                'uploader': '孫艾倫',
                'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
            },
        }
    ]
501
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Per-instance cache of extracted signature-decryption functions,
        # keyed by (player_url, signature cache id); see _decrypt_signature.
        self._player_cache = {}
505
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen('%s: Downloading video info webpage' % video_id)
509
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen('%s: Extracting video information' % video_id)
513
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen('%s: Format %s not available' % (video_id, format))
517
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen('RTMP download detected')
521
522 def _signature_cache_id(self, example_sig):
523 """ Return a string representation of a signature """
524 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
525
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Return a callable that deciphers a scrambled signature string.

        The player type (JS or SWF) and id are parsed from player_url, and
        the resulting character-permutation spec is persisted in the
        filesystem cache so the player only has to be downloaded once per
        (player, signature-layout) pair.
        """
        id_m = re.match(
            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
            player_url)
        if not id_m:
            raise ExtractorError('Cannot identify player %r' % player_url)
        player_type = id_m.group('ext')
        player_id = id_m.group('id')

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id becomes a cache filename, so it must not contain path parts
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # Cached spec is a list of source indices: a pure permutation
            return lambda s: ''.join(s[i] for i in cache_spec)

        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note='Downloading %s player %s' % (player_type, player_id),
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note='Downloading %s player %s' % (player_type, player_id),
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        if cache_spec is None:
            # Derive the permutation spec by running the function on a
            # probe string of distinct characters and recording where each
            # output character came from.
            test_string = ''.join(map(compat_chr, range(len(example_sig))))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
567
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the extracted signature function.

        Used with --youtube-print-sig-code so the permutation can be
        hardcoded upstream; consecutive index runs are compressed into
        slice expressions.
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render s[start:end+step:step] with the pieces that are
                # defaults omitted for readability
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            # Walk consecutive index pairs, merging +1/-1 runs into slices
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element or the still-open slice run
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
606
607 def _parse_sig_js(self, jscode):
608 funcname = self._search_regex(
609 r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
610 'Initial JS player signature function name')
611
612 jsi = JSInterpreter(jscode)
613 initial_function = jsi.extract_function(funcname)
614 return lambda s: initial_function([s])
615
616 def _parse_sig_swf(self, file_contents):
617 swfi = SWFInterpreter(file_contents)
618 TARGET_CLASSNAME = 'SignatureDecipher'
619 searched_class = swfi.extract_class(TARGET_CLASSNAME)
620 initial_function = swfi.extract_function(searched_class, 'decipher')
621 return lambda s: initial_function([s])
622
623 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
624 """Turn the encrypted s field into a working signature"""
625
626 if player_url is None:
627 raise ExtractorError('Cannot decrypt signature without player_url')
628
629 if player_url.startswith('//'):
630 player_url = 'https:' + player_url
631 try:
632 player_id = (player_url, self._signature_cache_id(s))
633 if player_id not in self._player_cache:
634 func = self._extract_signature_function(
635 video_id, player_url, s
636 )
637 self._player_cache[player_id] = func
638 func = self._player_cache[player_id]
639 if self._downloader.params.get('youtube_print_sig_code'):
640 self._print_sig_code(func, s)
641 return func(s)
642 except Exception as e:
643 tb = traceback.format_exc()
644 raise ExtractorError(
645 'Signature extraction failed: ' + tb, cause=e)
646
647 def _get_available_subtitles(self, video_id, webpage):
648 try:
649 subs_doc = self._download_xml(
650 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
651 video_id, note=False)
652 except ExtractorError as err:
653 self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
654 return {}
655
656 sub_lang_list = {}
657 for track in subs_doc.findall('track'):
658 lang = track.attrib['lang_code']
659 if lang in sub_lang_list:
660 continue
661 params = compat_urllib_parse.urlencode({
662 'lang': lang,
663 'v': video_id,
664 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
665 'name': track.attrib['name'].encode('utf-8'),
666 })
667 url = 'https://www.youtube.com/api/timedtext?' + params
668 sub_lang_list[lang] = url
669 if not sub_lang_list:
670 self._downloader.report_warning('video doesn\'t have subtitles')
671 return {}
672 return sub_lang_list
673
    def _get_available_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping target language code -> caption URL, or an
        empty dict (after a warning) when no automatic captions exist.
        """
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        self.to_screen('%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if mobj is None:
            self._downloader.report_warning(err_msg)
            return {}
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config['args']
            caption_url = args['ttsurl']
            timestamp = args['timestamp']
            # We get the available subtitles
            list_params = compat_urllib_parse.urlencode({
                'type': 'list',
                'tlangs': 1,
                'asrs': 1,
            })
            list_url = caption_url + '&' + list_params
            caption_list = self._download_xml(list_url, video_id)
            # The first <track> is the original-language (possibly ASR) track
            original_lang_node = caption_list.find('track')
            if original_lang_node is None:
                self._downloader.report_warning('Video doesn\'t have automatic captions')
                return {}
            original_lang = original_lang_node.attrib['lang_code']
            caption_kind = original_lang_node.attrib.get('kind', '')

            # Each <target> is a language the original track can be
            # auto-translated into
            sub_lang_list = {}
            for lang_node in caption_list.findall('target'):
                sub_lang = lang_node.attrib['lang_code']
                params = compat_urllib_parse.urlencode({
                    'lang': original_lang,
                    'tlang': sub_lang,
                    'fmt': sub_format,
                    'ts': timestamp,
                    'kind': caption_kind,
                })
                sub_lang_list[sub_lang] = caption_url + '&' + params
            return sub_lang_list
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
721
722 @classmethod
723 def extract_id(cls, url):
724 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
725 if mobj is None:
726 raise ExtractorError('Invalid URL: %s' % url)
727 video_id = mobj.group(2)
728 return video_id
729
730 def _extract_from_m3u8(self, manifest_url, video_id):
731 url_map = {}
732
733 def _get_urls(_manifest):
734 lines = _manifest.split('\n')
735 urls = filter(lambda l: l and not l.startswith('#'),
736 lines)
737 return urls
738 manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
739 formats_urls = _get_urls(manifest)
740 for format_url in formats_urls:
741 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
742 url_map[itag] = format_url
743 return url_map
744
    def _extract_annotations(self, video_id):
        # Fetch the raw annotations XML for the video (returned as text).
        url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
        return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
748
    def _parse_dash_manifest(
            self, video_id, dash_manifest_url, player_url, age_gate):
        """Download and parse a DASH MPD, returning a list of format dicts.

        Encrypted '/s/<sig>' path components in the manifest URL are
        deciphered first; parsed representations are merged with the static
        _formats metadata by itag.
        """
        def decrypt_sig(mobj):
            s = mobj.group(1)
            dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
            return '/signature/%s' % dec_s
        dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
        dash_doc = self._download_xml(
            dash_manifest_url, video_id,
            note='Downloading DASH manifest',
            errnote='Could not download DASH manifest')

        formats = []
        for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
            url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
            if url_el is None:
                continue
            format_id = r.attrib['id']
            video_url = url_el.text
            # YouTube-proprietary attribute carrying the total stream size
            filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
            f = {
                'format_id': format_id,
                'url': video_url,
                'width': int_or_none(r.attrib.get('width')),
                'height': int_or_none(r.attrib.get('height')),
                'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                'filesize': filesize,
                'fps': int_or_none(r.attrib.get('frameRate')),
            }
            # Merge with an already-seen representation of the same itag,
            # otherwise start from the static _formats metadata
            try:
                existing_format = next(
                    fo for fo in formats
                    if fo['format_id'] == format_id)
            except StopIteration:
                full_info = self._formats.get(format_id, {}).copy()
                full_info.update(f)
                formats.append(full_info)
            else:
                existing_format.update(f)
        return formats
790
    def _real_extract(self, url):
        """Extract metadata and all available formats for a single video.

        Handles next_url redirect URLs, age-gated videos (via the embed page
        and get_video_info), ytplayer.config parsing with get_video_info
        fallback, encrypted signatures, RTMP/HLS streams and the optional
        DASH manifest.
        """
        # Respect --prefer-insecure for every URL built below.
        proto = (
            'http' if self._downloader.params.get('prefer_insecure', False)
            else 'https')

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self.extract_id(url)

        # Get video webpage
        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
        video_webpage = self._download_webpage(url, video_id)

        # Attempt to extract SWF player URL (backslash-escaped inside swfConfig JSON)
        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        embed_webpage = None
        if re.search(r'player-age-gate-content">', video_webpage) is not None:
            age_gate = True
            # We simulate the access to the video from www.youtube.com/v/{video_id}
            # this can be viewed without login into Youtube
            url = proto + '://www.youtube.com/embed/%s' % video_id
            embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
            data = compat_urllib_parse.urlencode({
                'video_id': video_id,
                'eurl': 'https://youtube.googleapis.com/v/' + video_id,
                'sts': self._search_regex(
                    r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
            })
            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note='Refetching age-gated info webpage',
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
        else:
            age_gate = False
            try:
                # Try looking directly into the video webpage
                mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
                if not mobj:
                    raise ValueError('Could not find ytplayer.config')  # caught below
                json_code = uppercase_escape(mobj.group(1))
                ytplayer_config = json.loads(json_code)
                args = ytplayer_config['args']
                # Convert to the same format returned by compat_parse_qs
                video_info = dict((k, [v]) for k, v in args.items())
                if 'url_encoded_fmt_stream_map' not in args:
                    raise ValueError('No stream_map present')  # caught below
            except ValueError:
                # We fallback to the get_video_info pages (used by the embed page)
                self.report_video_info_webpage_download(video_id)
                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                    video_info_url = (
                        '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                        % (proto, video_id, el_type))
                    video_info_webpage = self._download_webpage(
                        video_info_url,
                        video_id, note=False,
                        errnote='unable to download video info webpage')
                    video_info = compat_parse_qs(video_info_webpage)
                    if 'token' in video_info:
                        break
        # Without a token the video is unplayable; surface YouTube's own
        # reason if it provided one.
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(
                    'YouTube said: %s' % video_info['reason'][0],
                    expected=True, video_id=video_id)
            else:
                raise ExtractorError(
                    '"token" parameter not in video info for unknown reason',
                    video_id=video_id)

        if 'view_count' in video_info:
            view_count = int(video_info['view_count'][0])
        else:
            view_count = None

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError('"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError('Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning('unable to extract uploader nickname')

        # title
        if 'title' in video_info:
            video_title = video_info['title'][0]
        else:
            self._downloader.report_warning('Unable to extract video title')
            video_title = '_'

        # thumbnail image
        # We try first to get a high quality image:
        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                            video_webpage, re.DOTALL)
        if m_thumb is not None:
            video_thumbnail = m_thumb.group(1)
        elif 'thumbnail_url' not in video_info:
            self._downloader.report_warning('unable to extract video thumbnail')
            video_thumbnail = None
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
        if mobj is None:
            mobj = re.search(
                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
                video_webpage)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        m_cat_container = self._search_regex(
            r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
            video_webpage, 'categories', default=None)
        if m_cat_container:
            category = self._html_search_regex(
                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
                default=None)
            video_categories = None if category is None else [category]
        else:
            video_categories = None

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            # Replace redirect-wrapper links with just their title text.
            video_description = re.sub(r'''(?x)
                <a\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    title="([^"]+)"\s+
                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
                    class="yt-uix-redirect-link"\s*>
                [^<]+
                </a>
            ''', r'\1', video_description)
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = ''

        def _extract_count(count_name):
            # Pull a comma-grouped integer (e.g. "1,234") out of the
            # watch-<name> element; None when the element is absent.
            count = self._search_regex(
                r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
                video_webpage, count_name, default=None)
            if count is not None:
                return int(count.replace(',', ''))
            return None
        like_count = _extract_count('like')
        dislike_count = _extract_count('dislike')

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, video_webpage)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, video_webpage)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning('unable to extract video duration')
            video_duration = None
        else:
            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

        # annotations
        video_annotations = None
        if self._downloader.params.get('writeannotations', False):
            video_annotations = self._extract_annotations(video_id)

        def _map_to_format_list(urlmap):
            # Turn an {itag: url} mapping into format dicts, enriched from
            # the static self._formats itag table where known.
            formats = []
            for itag, video_real_url in urlmap.items():
                dct = {
                    'format_id': itag,
                    'url': video_real_url,
                    'player_url': player_url,
                }
                if itag in self._formats:
                    dct.update(self._formats[itag])
                formats.append(dct)
            return formats

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            formats = [{
                'format_id': '_rtmp',
                'protocol': 'rtmp',
                'url': video_info['conn'][0],
                'player_url': player_url,
            }]
        elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
            if 'rtmpe%3Dyes' in encoded_url_map:
                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
            url_map = {}
            for url_data_str in encoded_url_map.split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' not in url_data or 'url' not in url_data:
                    continue
                format_id = url_data['itag'][0]
                url = url_data['url'][0]

                if 'sig' in url_data:
                    # Plain (unencrypted) signature.
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    # Encrypted signature: we need the JS/SWF player to decrypt it.
                    encrypted_sig = url_data['s'][0]
                    ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'

                    jsplayer_url_json = self._search_regex(
                        ASSETS_RE,
                        embed_webpage if age_gate else video_webpage,
                        'JS player URL (1)', default=None)
                    if not jsplayer_url_json and not age_gate:
                        # We need the embed website after all
                        if embed_webpage is None:
                            embed_url = proto + '://www.youtube.com/embed/%s' % video_id
                            embed_webpage = self._download_webpage(
                                embed_url, video_id, 'Downloading embed webpage')
                        jsplayer_url_json = self._search_regex(
                            ASSETS_RE, embed_webpage, 'JS player URL')

                    player_url = json.loads(jsplayer_url_json)
                    if player_url is None:
                        player_url_json = self._search_regex(
                            r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
                            video_webpage, 'age gate player URL')
                        player_url = json.loads(player_url_json)

                    if self._downloader.params.get('verbose'):
                        if player_url is None:
                            player_version = 'unknown'
                            player_desc = 'unknown'
                        else:
                            if player_url.endswith('swf'):
                                player_version = self._search_regex(
                                    r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
                                    'flash player', fatal=False)
                                player_desc = 'flash player %s' % player_version
                            else:
                                player_version = self._search_regex(
                                    r'html5player-([^/]+?)(?:/html5player)?\.js',
                                    player_url,
                                    'html5 player', fatal=False)
                                player_desc = 'html5 player %s' % player_version

                        parts_sizes = self._signature_cache_id(encrypted_sig)
                        self.to_screen('{%s} signature length %s, %s' %
                                       (format_id, parts_sizes, player_desc))

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[format_id] = url
            formats = _map_to_format_list(url_map)
        elif video_info.get('hlsvp'):
            # Live / HLS-only videos expose an m3u8 manifest instead.
            manifest_url = video_info['hlsvp'][0]
            url_map = self._extract_from_m3u8(manifest_url, video_id)
            formats = _map_to_format_list(url_map)
        else:
            raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

        # Look for the DASH manifest
        if self._downloader.params.get('youtube_include_dash_manifest', True):
            dash_mpd = video_info.get('dashmpd')
            if dash_mpd:
                dash_manifest_url = dash_mpd[0]
                try:
                    dash_formats = self._parse_dash_manifest(
                        video_id, dash_manifest_url, player_url, age_gate)
                except (ExtractorError, KeyError) as e:
                    self.report_warning(
                        'Skipping DASH manifest: %r' % e, video_id)
                else:
                    # Hide the formats we found through non-DASH
                    dash_keys = set(df['format_id'] for df in dash_formats)
                    for f in formats:
                        if f['format_id'] in dash_keys:
                            f['format_id'] = 'nondash-%s' % f['format_id']
                            f['preference'] = f.get('preference', 0) - 10000
                    formats.extend(dash_formats)

        # Check for malformed aspect ratio
        stretched_m = re.search(
            r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
            video_webpage)
        if stretched_m:
            ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
            for f in formats:
                if f.get('vcodec') != 'none':
                    f['stretched_ratio'] = ratio

        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'categories': video_categories,
            'subtitles': video_subtitles,
            'duration': video_duration,
            'age_limit': 18 if age_gate else 0,
            'annotations': video_annotations,
            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
            'formats': formats,
        }
1131
1132
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        }
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
        },
        # Fixed typo: was 'playlist_mincout', which the test runner would
        # silently ignore, so the count was never actually checked.
        'playlist_mincount': 21,
    }]

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Turn a list of video ids into url_result entries."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _extract_mix(self, playlist_id):
        """Extract a mix playlist.

        Mixes are generated from a single video; the id of the playlist is
        just 'RD' + video_id, and the entries are only present on the watch
        page of that seed video.
        """
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')

        # PEP8: a named function instead of a lambda assignment (E731)
        def search_title(class_name):
            return get_element_by_attribute('class', class_name, webpage)

        title_span = (
            search_title('playlist-title') or
            search_title('title long-title') or
            search_title('title'))
        title = clean_html(title_span)
        ids = orderedSet(re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        """Extract all entries of a playlist, following "Load more" pages."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)
        more_widget_html = content_html = page

        # Check if the playlist exists or is private
        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            matches = re.finditer(self._VIDEO_RE, content_html)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            if not content_html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                break
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
1318
1319
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, de-duplicated, in order."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = match.group(1)
            if candidate not in found:
                found.append(candidate)
        return found

    def _real_extract(self, url):
        channel_id = self._match_id(url)

        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # Autogenerated channels list everything on one page; their
            # ajax continuation pages come back empty.
            entries = [
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        def _entries():
            # Walk the "Load more" continuation pages lazily.
            more_widget_html = content_html = channel_page
            for pagenum in itertools.count(1):

                for video_id in self.extract_videos_from_page(content_html):
                    yield self.url_result(
                        video_id, 'Youtube', video_id=video_id)

                mobj = re.search(
                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                    more_widget_html)
                if not mobj:
                    break

                more = self._download_json(
                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
                    'Downloading page #%s' % (pagenum + 1),
                    transform_source=uppercase_escape)
                content_html = more['content_html']
                more_widget_html = more['load_more_widget_html']

        return self.playlist_result(_entries(), channel_id)
1384
1385
class YoutubeUserIE(InfoExtractor):
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    # GData API returns at most this many entries per request.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and it would match otherwise.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        username = self._match_id(url)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield url entries for one GData page (pagenum is 0-based)."""
            # GData start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            # A feed without 'entry' means we paged past the last video.
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1454
1455
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        PAGE_SIZE = 50
        video_ids = []
        limit = n
        pagenum = 0

        # Keep paging until we have collected up to `limit` results;
        # `limit` shrinks once the API reports its total item count.
        while PAGE_SIZE * pagenum < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                PAGE_SIZE * pagenum + 1)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            video_ids.extend(video['id'] for video in api_response['items'])

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # Trim any overshoot from the last page.
        del video_ids[n:]
        videos = [
            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in video_ids]
        return self.playlist_result(videos, query)
1497
1498
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same GData search as YoutubeSearchIE, but ordered by upload date
    # (orderby=published) and reachable through the "ytsearchdate" key.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
1504
1505
class YoutubeSearchURLIE(InfoExtractor):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Scrape the result entries straight out of a search results page."""
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        for snippet in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], snippet,
                'item title', fatal=False)
            href = self._html_search_regex(
                r'(?s)href="([^"]+)"', snippet, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1547
1548
class YoutubeShowIE(InfoExtractor):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return one playlist entry per season of the show."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # Each season of the show is published as its own playlist.
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_matches)))
        season_entries = [
            self.url_result(
                'https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in season_matches
        ]

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': self._og_search_title(webpage, fatal=False),
            'entries': season_entries,
        }
1583
1584
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # URL template with a single %s placeholder left for the paging token.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Collect all feed entries, following the "Load more" paging token."""
        feed_entries = []
        # Starts as int 0 for the first request; subsequent values are the
        # string tokens captured from the load-more widget.
        paging = 0
        for i in itertools.count(1):
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % i,
                transform_source=uppercase_escape)
            feed_html = info.get('feed_html') or info.get('content_html')
            load_more_widget_html = info.get('load_more_widget_html') or feed_html
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if mobj is None:
                break
            paging = mobj.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1632
1633
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # System feed: works with the default action_load_system_feed.
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
1639
1640
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    # The list is tied to the logged-in account, so query the personal feed.
    _PERSONAL_FEED = True
1647
1648
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    # Raw string: the previous plain literal relied on Python passing the
    # unknown '\.' escape through unchanged, which raises a warning on modern
    # interpreters; every sibling extractor already uses r'' for _VALID_URL.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # The history list is tied to the logged-in account.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
1655
1656
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the favourites playlist id and delegate to the playlist extractor."""
        webpage = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', webpage, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1667
1668
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Collect the subscriptions feed, following "Load more" pages."""
        title = 'Youtube Subscriptions'
        page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', title)

        # Same pagination scheme as playlists, but the video links here
        # carry no index attribute.
        ids = []
        more_widget_html = content_html = page

        for page_num in itertools.count(1):
            ids.extend(orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)))

            mobj = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % mobj.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }
1705
1706
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match youtube watch/attribution URLs that carry no video id.

    Such URLs usually result from the user forgetting to quote the URL,
    so the shell split it at '&'. Instead of a generic "unsupported URL"
    failure, _real_extract raises an error explaining the likely cause.
    """
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    # Verbose regex ((?x)): a watch URL with only a non-id query parameter
    # (or none at all — note the trailing empty alternative plus '?'),
    # or an attribution link without a 'u' parameter.
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Nothing can be downloaded from such a URL; report the likely
        # cause. (Fix: the original message concatenated '…" ' + ' or…',
        # producing a stray double space in the user-facing text.)
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            'or simply youtube-dl BaW_jenozKc .',
            expected=True)
1750
1751
class YoutubeTruncatedIDIE(InfoExtractor):
    """Catch watch URLs whose video id is shorter than the required 11 chars."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # A truncated id can never be downloaded; fail with a clear message.
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url)
        raise ExtractorError(message, expected=True)