yt_dlp/extractor/reddit.py

   1 import random
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     ExtractorError,
   6     int_or_none,
   7     float_or_none,
   8     try_get,
   9     unescapeHTML,
  10     url_or_none,
  11     traverse_obj
  12 )
  13
  14
  15 class RedditIE(InfoExtractor):
  16     _VALID_URL = r'https?://(?P<subdomain>[^/]+\.)?reddit(?:media)?\.com/r/(?P<slug>[^/]+/comments/(?P<id>[^/?#&]+))'
  17     _TESTS = [{
  18         'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/',
  19         'info_dict': {
  20             'id': 'zv89llsvexdz',
  21             'ext': 'mp4',
  22             'title': 'That small heart attack.',
  23             'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  24             'thumbnails': 'count:4',
  25             'timestamp': 1501941939,
  26             'upload_date': '20170805',
  27             'uploader': 'Antw87',
  28             'duration': 12,
  29             'like_count': int,
  30             'dislike_count': int,
  31             'comment_count': int,
  32             'age_limit': 0,
  33         },
  34         'params': {
  35             'skip_download': True,
  36         },
  37     }, {
  38         'url': 'https://www.reddit.com/r/videos/comments/6rrwyj',
  39         'only_matching': True,
  40     }, {
  41         # imgur
  42         'url': 'https://www.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/',
  43         'only_matching': True,
  44     }, {
  45         # imgur @ old reddit
  46         'url': 'https://old.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/',
  47         'only_matching': True,
  48     }, {
  49         # streamable
  50         'url': 'https://www.reddit.com/r/videos/comments/6t7sg9/comedians_hilarious_joke_about_the_guam_flag/',
  51         'only_matching': True,
  52     }, {
  53         # youtube
  54         'url': 'https://www.reddit.com/r/videos/comments/6t75wq/southern_man_tries_to_speak_without_an_accent/',
  55         'only_matching': True,
  56     }, {
  57         # reddit video @ nm reddit
  58         'url': 'https://nm.reddit.com/r/Cricket/comments/8idvby/lousy_cameraman_finds_himself_in_cairns_line_of/',
  59         'only_matching': True,
  60     }, {
  61         'url': 'https://www.redditmedia.com/r/serbia/comments/pu9wbx/ako_vu%C4%8Di%C4%87_izgubi_izbore_ja_%C4%87u_da_crknem/',
  62         'only_matching': True,
  63     }]
  64
  65     @staticmethod
  66     def _gen_session_id():
  67         id_length = 16
  68         rand_max = 1 << (id_length * 4)
  69         return '%0.*x' % (id_length, random.randrange(rand_max))
  70
  71     def _real_extract(self, url):
  72         subdomain, slug, video_id = self._match_valid_url(url).group('subdomain', 'slug', 'id')
  73
  74         self._set_cookie('.reddit.com', 'reddit_session', self._gen_session_id())
  75         self._set_cookie('.reddit.com', '_options', '%7B%22pref_quarantine_optin%22%3A%20true%7D')
  76         data = self._download_json(f'https://{subdomain}reddit.com/r/{slug}/.json', video_id, fatal=False)
  77         if not data:
  78             # Fall back to old.reddit.com in case the requested subdomain fails
  79             data = self._download_json(f'https://old.reddit.com/r/{slug}/.json', video_id)
  80         data = data[0]['data']['children'][0]['data']
  81         video_url = data['url']
  82
  83         # Avoid recursing into the same reddit URL
  84         if 'reddit.com/' in video_url and '/%s/' % video_id in video_url:
  85             raise ExtractorError('No media found', expected=True)
  86
  87         over_18 = data.get('over_18')
  88         if over_18 is True:
  89             age_limit = 18
  90         elif over_18 is False:
  91             age_limit = 0
  92         else:
  93             age_limit = None
  94
  95         thumbnails = []
  96
  97         def add_thumbnail(src):
  98             if not isinstance(src, dict):
  99                 return
 100             thumbnail_url = url_or_none(src.get('url'))
 101             if not thumbnail_url:
 102                 return
 103             thumbnails.append({
 104                 'url': unescapeHTML(thumbnail_url),
 105                 'width': int_or_none(src.get('width')),
 106                 'height': int_or_none(src.get('height')),
 107             })
 108
 109         for image in try_get(data, lambda x: x['preview']['images']) or []:
 110             if not isinstance(image, dict):
 111                 continue
 112             add_thumbnail(image.get('source'))
 113             resolutions = image.get('resolutions')
 114             if isinstance(resolutions, list):
 115                 for resolution in resolutions:
 116                     add_thumbnail(resolution)
 117
 118         info = {
 119             'title': data.get('title'),
 120             'thumbnails': thumbnails,
 121             'timestamp': float_or_none(data.get('created_utc')),
 122             'uploader': data.get('author'),
 123             'like_count': int_or_none(data.get('ups')),
 124             'dislike_count': int_or_none(data.get('downs')),
 125             'comment_count': int_or_none(data.get('num_comments')),
 126             'age_limit': age_limit,
 127         }
 128
 129         # Check if media is hosted on reddit:
 130         reddit_video = traverse_obj(data, (('media', 'secure_media'), 'reddit_video'), get_all=False)
 131         if reddit_video:
 132             playlist_urls = [
 133                 try_get(reddit_video, lambda x: unescapeHTML(x[y]))
 134                 for y in ('dash_url', 'hls_url')
 135             ]
 136
 137             # Update video_id
 138             display_id = video_id
 139             video_id = self._search_regex(
 140                 r'https?://v\.redd\.it/(?P<id>[^/?#&]+)', reddit_video['fallback_url'],
 141                 'video_id', default=display_id)
 142
 143             dash_playlist_url = playlist_urls[0] or f'https://v.redd.it/{video_id}/DASHPlaylist.mpd'
 144             hls_playlist_url = playlist_urls[1] or f'https://v.redd.it/{video_id}/HLSPlaylist.m3u8'
 145
 146             formats = self._extract_m3u8_formats(
 147                 hls_playlist_url, display_id, 'mp4',
 148                 entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
 149             formats.extend(self._extract_mpd_formats(
 150                 dash_playlist_url, display_id, mpd_id='dash', fatal=False))
 151             self._sort_formats(formats)
 152
 153             return {
 154                 **info,
 155                 'id': video_id,
 156                 'display_id': display_id,
 157                 'formats': formats,
 158                 'duration': int_or_none(reddit_video.get('duration')),
 159             }
 160
 161         # Not hosted on reddit, must continue extraction
 162         return {
 163             **info,
 164             'display_id': video_id,
 165             '_type': 'url_transparent',
 166             'url': video_url,
 167         }