[youtube] Added 'subscriber_count' to extraction

[yt-dlp.git] / youtube_dl / extractor / youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 43c5eff1f4ed9e3e3fd61041875ed4f67dc04bad..f0d2a8873c2173b4ebf9b352f4a59ce58656a503 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -39,6 +39,7 @@
      mimetype2ext,
      orderedSet,
      parse_codecs,
+    parse_count,
      parse_duration,
      remove_quotes,
      remove_start,
@@ -303,7 +304,7 @@ def _entries(self, page, playlist_id):
                      # Downloading page may result in intermittent 5xx HTTP error
                      # that is usually worked around with a retry
                      more = self._download_json(
-                        'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+                        'https://www.youtube.com/%s' % mobj.group('more'), playlist_id,
                          'Downloading page #%s%s'
                          % (page_num, ' (retry #%d)' % count if count else ''),
                          transform_source=uppercase_escape,
@@ -1264,7 +1265,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              'params': {
                  'skip_download': True,
              },
-        }
+        },
+        {
+            # empty description results in an empty string
+            'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
+            'info_dict': {
+                'id': 'x41yOUIvK2k',
+                'ext': 'mp4',
+                'title': 'IMG 3456',
+                'description': '',
+                'upload_date': '20170613',
+                'uploader_id': 'ElevageOrVert',
+                'uploader': 'ElevageOrVert',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
      ]
  
      def __init__(self, *args, **kwargs):
@@ -1384,7 +1401,7 @@ def _parse_sig_js(self, jscode):
          funcname = self._search_regex(
              (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
               r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
-             r'\b(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
+             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
               r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
               # Obsolete patterns
               r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
@@ -1825,7 +1842,8 @@ def extract_player_response(player_response, video_id):
          # Get video info
          video_info = {}
          embed_webpage = None
-        if re.search(r'player-age-gate-content">', video_webpage) is not None:
+        if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
+                or re.search(r'player-age-gate-content">', video_webpage) is not None):
              age_gate = True
              # We simulate the access to the video from www.youtube.com/v/{video_id}
              # this can be viewed without login into Youtube
@@ -1930,7 +1948,9 @@ def replace_url(m):
              ''', replace_url, video_description)
              video_description = clean_html(video_description)
          else:
-            video_description = self._html_search_meta('description', video_webpage) or video_details.get('shortDescription')
+            video_description = video_details.get('shortDescription')
+            if video_description is None:
+                video_description = self._html_search_meta('description', video_webpage)
  
          if not smuggled_data.get('force_singlefeed', False):
              if not self._downloader.params.get('noplaylist'):
@@ -2402,6 +2422,14 @@ def _extract_count(count_name):
              video_duration = parse_duration(self._html_search_meta(
                  'duration', video_webpage, 'video duration'))
  
+        # Channel subscriber count scraped from the watch page; only the English "N subscribers" label is matched
+        subscriber_count = parse_count(self._search_regex(
+            r'"text":"([\d\.]+\w?) subscribers"',
+            video_webpage,
+            'subscriber count',
+            default=None
+        ))
+
          # annotations
          video_annotations = None
          if self._downloader.params.get('writeannotations', False):
@@ -2539,6 +2567,7 @@ def decrypt_sig(mobj):
              'album': album,
              'release_date': release_date,
              'release_year': release_year,
+            'subscriber_count': subscriber_count,
          }
  
  
@@ -2776,7 +2805,7 @@ def _extract_mix(self, playlist_id):
          ids = []
          last_id = playlist_id[-11:]
          for n in itertools.count(1):
-            url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
+            url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
              webpage = self._download_webpage(
                  url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
              new_ids = orderedSet(re.findall(
@@ -3008,7 +3037,7 @@ def _real_extract(self, url):
  
  class YoutubeUserIE(YoutubeChannelIE):
      IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
-    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
+    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
      _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
      IE_NAME = 'youtube:user'
  
@@ -3038,6 +3067,9 @@ class YoutubeUserIE(YoutubeChannelIE):
      }, {
          'url': 'https://www.youtube.com/c/gametrailers',
          'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
+        'only_matching': True,
      }, {
          'url': 'https://www.youtube.com/gametrailers',
          'only_matching': True,
@@ -3289,7 +3321,7 @@ def _entries(self, page):
                  break
  
              more = self._download_json(
-                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
+                'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                  'Downloading page #%s' % page_num,
                  transform_source=uppercase_escape,
                  headers=self._YOUTUBE_CLIENT_HEADERS)