[crunchyroll:beta] Add cookies support (#2506)

[yt-dlp.git] / yt_dlp / extractor / reddit.py
diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py

index 14592bc62cdae6410b5c9e84c98977fca059a3b0..a042a59cc4de0273b0d1f569780bae55b001a0cb 100644 (file)
--- a/yt_dlp/extractor/reddit.py
+++ b/yt_dlp/extractor/reddit.py
@@ -1,5 +1,4 @@
-from __future__ import unicode_literals
-
+import random
  
  from .common import InfoExtractor
  from ..utils import (
@@ -9,47 +8,12 @@
      try_get,
      unescapeHTML,
      url_or_none,
+    traverse_obj
  )
  
  
  class RedditIE(InfoExtractor):
-    _VALID_URL = r'https?://v\.redd\.it/(?P<id>[^/?#&]+)'
-    _TEST = {
-        # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/
-        'url': 'https://v.redd.it/zv89llsvexdz',
-        'md5': '0a070c53eba7ec4534d95a5a1259e253',
-        'info_dict': {
-            'id': 'zv89llsvexdz',
-            'ext': 'mp4',
-            'title': 'zv89llsvexdz',
-        },
-        'params': {
-            'format': 'bestvideo',
-        },
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        formats = self._extract_m3u8_formats(
-            'https://v.redd.it/%s/HLSPlaylist.m3u8' % video_id, video_id,
-            'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
-
-        formats.extend(self._extract_mpd_formats(
-            'https://v.redd.it/%s/DASHPlaylist.mpd' % video_id, video_id,
-            mpd_id='dash', fatal=False))
-
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': video_id,
-            'formats': formats,
-        }
-
-
-class RedditRIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[^/]+\.)?reddit(?:media)?\.com/r/(?P<slug>[^/]+/comments/(?P<id>[^/?#&]+))'
+    _VALID_URL = r'https?://(?P<subdomain>[^/]+\.)?reddit(?:media)?\.com/r/(?P<slug>[^/]+/comments/(?P<id>[^/?#&]+))'
      _TESTS = [{
          'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/',
          'info_dict': {
@@ -68,7 +32,6 @@ class RedditRIE(InfoExtractor):
              'age_limit': 0,
          },
          'params': {
-            'format': 'bestvideo',
              'skip_download': True,
          },
      }, {
@@ -99,13 +62,22 @@ class RedditRIE(InfoExtractor):
          'only_matching': True,
      }]
  
-    def _real_extract(self, url):
-        slug, video_id = self._match_valid_url(url).group('slug', 'id')
-
-        self._set_cookie('reddit.com', '_options', '%7B%22pref_quarantine_optin%22%3A%20true%7D')
-        data = self._download_json(
-            f'https://old.reddit.com/r/{slug}/.json', video_id)[0]['data']['children'][0]['data']
+    @staticmethod
+    def _gen_session_id():
+        # Random 64-bit value rendered as 16 zero-padded lowercase hex digits
+        hex_digits = 16
+        return f'{random.randrange(1 << (hex_digits * 4)):0{hex_digits}x}'
  
+    def _real_extract(self, url):
+        subdomain, slug, video_id = self._match_valid_url(url).group('subdomain', 'slug', 'id')
+
+        self._set_cookie('.reddit.com', 'reddit_session', self._gen_session_id())
+        self._set_cookie('.reddit.com', '_options', '%7B%22pref_quarantine_optin%22%3A%20true%7D')
+        data = self._download_json(f'https://{subdomain}reddit.com/r/{slug}/.json', video_id, fatal=False)
+        if not data:
+            # Fall back to old.reddit.com in case the requested subdomain fails
+            data = self._download_json(f'https://old.reddit.com/r/{slug}/.json', video_id)
+        data = data[0]['data']['children'][0]['data']
          video_url = data['url']
  
          # Avoid recursing into the same reddit URL
@@ -143,19 +115,53 @@ def add_thumbnail(src):
                  for resolution in resolutions:
                      add_thumbnail(resolution)
  
-        return {
-            '_type': 'url_transparent',
-            'url': video_url,
+        info = {
              'title': data.get('title'),
              'thumbnails': thumbnails,
              'timestamp': float_or_none(data.get('created_utc')),
              'uploader': data.get('author'),
-            'duration': int_or_none(try_get(
-                data,
-                (lambda x: x['media']['reddit_video']['duration'],
-                 lambda x: x['secure_media']['reddit_video']['duration']))),
              'like_count': int_or_none(data.get('ups')),
              'dislike_count': int_or_none(data.get('downs')),
              'comment_count': int_or_none(data.get('num_comments')),
              'age_limit': age_limit,
          }
+
+        # Check if media is hosted on reddit:
+        reddit_video = traverse_obj(data, (('media', 'secure_media'), 'reddit_video'), get_all=False)
+        if reddit_video:
+            playlist_urls = [
+                try_get(reddit_video, lambda x: unescapeHTML(x[y]))
+                for y in ('dash_url', 'hls_url')
+            ]
+
+            # Update video_id
+            display_id = video_id
+            video_id = self._search_regex(
+                r'https?://v\.redd\.it/(?P<id>[^/?#&]+)', reddit_video['fallback_url'],
+                'video_id', default=display_id)
+
+            dash_playlist_url = playlist_urls[0] or f'https://v.redd.it/{video_id}/DASHPlaylist.mpd'
+            hls_playlist_url = playlist_urls[1] or f'https://v.redd.it/{video_id}/HLSPlaylist.m3u8'
+
+            formats = self._extract_m3u8_formats(
+                hls_playlist_url, display_id, 'mp4',
+                entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+            formats.extend(self._extract_mpd_formats(
+                dash_playlist_url, display_id, mpd_id='dash', fatal=False))
+            self._sort_formats(formats)
+
+            return {
+                **info,
+                'id': video_id,
+                'display_id': display_id,
+                'formats': formats,
+                'duration': int_or_none(reddit_video.get('duration')),
+            }
+
+        # Not hosted on reddit, must continue extraction
+        return {
+            **info,
+            'display_id': video_id,
+            '_type': 'url_transparent',
+            'url': video_url,
+        }