yt_dlp/extractor/pr0gramm.py

   1 import json
   2 from urllib.parse import unquote
   3
   4 from .common import InfoExtractor
   5 from ..compat import functools
   6 from ..utils import (
   7     ExtractorError,
   8     float_or_none,
   9     int_or_none,
  10     make_archive_id,
  11     mimetype2ext,
  12     str_or_none,
  13     urljoin,
  14 )
  15 from ..utils.traversal import traverse_obj
  16
  17
  18 class Pr0grammIE(InfoExtractor):
  19     _VALID_URL = r'https?://pr0gramm\.com\/(?:[^/?#]+/)+(?P<id>[\d]+)(?:[/?#:]|$)'
  20     _TESTS = [{
  21         'url': 'https://pr0gramm.com/new/video/5466437',
  22         'info_dict': {
  23             'id': '5466437',
  24             'ext': 'mp4',
  25             'title': 'pr0gramm-5466437 by g11st',
  26             'tags': ['Neon Genesis Evangelion', 'Touhou Project', 'Fly me to the Moon', 'Marisad', 'Marisa Kirisame', 'video', 'sound', 'Marisa', 'Anime'],
  27             'uploader': 'g11st',
  28             'uploader_id': '394718',
  29             'timestamp': 1671590240,
  30             'upload_date': '20221221',
  31             'like_count': int,
  32             'dislike_count': int,
  33             'age_limit': 0,
  34             'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
  35             '_old_archive_ids': ['pr0grammstatic 5466437'],
  36         },
  37     }, {
  38         'url': 'https://pr0gramm.com/new/3052805:comment28391322',
  39         'info_dict': {
  40             'id': '3052805',
  41             'ext': 'mp4',
  42             'title': 'pr0gramm-3052805 by Hansking1',
  43             'tags': 'count:15',
  44             'uploader': 'Hansking1',
  45             'uploader_id': '385563',
  46             'timestamp': 1552930408,
  47             'upload_date': '20190318',
  48             'like_count': int,
  49             'dislike_count': int,
  50             'age_limit': 0,
  51             'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
  52             '_old_archive_ids': ['pr0grammstatic 3052805'],
  53         },
  54     }, {
  55         # Requires verified account
  56         'url': 'https://pr0gramm.com/new/Gianna%20Michaels/5848332',
  57         'info_dict': {
  58             'id': '5848332',
  59             'ext': 'mp4',
  60             'title': 'pr0gramm-5848332 by erd0pfel',
  61             'tags': 'count:18',
  62             'uploader': 'erd0pfel',
  63             'uploader_id': '349094',
  64             'timestamp': 1694489652,
  65             'upload_date': '20230912',
  66             'like_count': int,
  67             'dislike_count': int,
  68             'age_limit': 18,
  69             'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg',
  70             '_old_archive_ids': ['pr0grammstatic 5848332'],
  71         },
  72     }, {
  73         'url': 'https://pr0gramm.com/top/5895149',
  74         'info_dict': {
  75             'id': '5895149',
  76             'ext': 'mp4',
  77             'title': 'pr0gramm-5895149 by algoholigSeeManThrower',
  78             'tags': 'count:19',
  79             'uploader': 'algoholigSeeManThrower',
  80             'uploader_id': '457556',
  81             'timestamp': 1697580902,
  82             'upload_date': '20231018',
  83             'like_count': int,
  84             'dislike_count': int,
  85             'age_limit': 0,
  86             'thumbnail': 'https://thumb.pr0gramm.com/2023/10/18/db47bb3db5e1a1b3.jpg',
  87             '_old_archive_ids': ['pr0grammstatic 5895149'],
  88         },
  89     }, {
  90         'url': 'https://pr0gramm.com/static/5466437',
  91         'only_matching': True,
  92     }, {
  93         'url': 'https://pr0gramm.com/new/rowan%20atkinson%20herr%20bohne/3052805',
  94         'only_matching': True,
  95     }, {
  96         'url': 'https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290',
  97         'only_matching': True,
  98     }]
  99
 100     BASE_URL = 'https://pr0gramm.com'
 101
 102     @functools.cached_property
 103     def _is_logged_in(self):
 104         return 'pp' in self._get_cookies(self.BASE_URL)
 105
 106     @functools.cached_property
 107     def _maximum_flags(self):
 108         # We need to guess the flags for the content otherwise the api will raise an error
 109         # We can guess the maximum allowed flags for the account from the cookies
 110         # Bitflags are (msbf): pol, nsfp, nsfl, nsfw, sfw
 111         flags = 0b10001
 112         if self._is_logged_in:
 113             flags |= 0b01000
 114             cookies = self._get_cookies(self.BASE_URL)
 115             if 'me' not in cookies:
 116                 self._download_webpage(self.BASE_URL, None, 'Refreshing verification information')
 117             if traverse_obj(cookies, ('me', {lambda x: x.value}, {unquote}, {json.loads}, 'verified')):
 118                 flags |= 0b00110
 119
 120         return flags
 121
 122     def _call_api(self, endpoint, video_id, query={}, note='Downloading API json'):
 123         data = self._download_json(
 124             f'https://pr0gramm.com/api/items/{endpoint}',
 125             video_id, note, query=query, expected_status=403)
 126
 127         error = traverse_obj(data, ('error', {str}))
 128         if error in ('nsfwRequired', 'nsflRequired', 'nsfpRequired', 'verificationRequired'):
 129             if not self._is_logged_in:
 130                 self.raise_login_required()
 131             raise ExtractorError(f'Unverified account cannot access NSFW/NSFL ({error})', expected=True)
 132         elif error:
 133             message = traverse_obj(data, ('msg', {str})) or error
 134             raise ExtractorError(f'API returned error: {message}', expected=True)
 135
 136         return data
 137
 138     @staticmethod
 139     def _create_source_url(path):
 140         return urljoin('https://img.pr0gramm.com', path)
 141
 142     def _real_extract(self, url):
 143         video_id = self._match_id(url)
 144         video_info = traverse_obj(
 145             self._call_api('get', video_id, {'id': video_id, 'flags': self._maximum_flags}),
 146             ('items', 0, {dict}))
 147
 148         source = video_info.get('image')
 149         if not source or not source.endswith('mp4'):
 150             self.raise_no_formats('Could not extract a video', expected=bool(source), video_id=video_id)
 151
 152         metadata = self._call_api('info', video_id, {'itemId': video_id}, note='Downloading tags')
 153         tags = traverse_obj(metadata, ('tags', ..., 'tag', {str}))
 154         # Sorted by "confidence", higher confidence = earlier in list
 155         confidences = traverse_obj(metadata, ('tags', ..., 'confidence', ({int}, {float})))
 156         if confidences:
 157             tags = [tag for _, tag in sorted(zip(confidences, tags), reverse=True)]
 158
 159         formats = traverse_obj(video_info, ('variants', ..., {
 160             'format_id': ('name', {str}),
 161             'url': ('path', {self._create_source_url}),
 162             'ext': ('mimeType', {mimetype2ext}),
 163             'vcodec': ('codec', {str}),
 164             'width': ('width', {int_or_none}),
 165             'height': ('height', {int_or_none}),
 166             'bitrate': ('bitRate', {float_or_none}),
 167             'filesize': ('fileSize', {int_or_none}),
 168         })) if video_info.get('variants') else [{
 169             'ext': 'mp4',
 170             'format_id': 'source',
 171             **traverse_obj(video_info, {
 172                 'url': ('image', {self._create_source_url}),
 173                 'width': ('width', {int_or_none}),
 174                 'height': ('height', {int_or_none}),
 175             }),
 176         }]
 177
 178         subtitles = {}
 179         for subtitle in traverse_obj(video_info, ('subtitles', lambda _, v: v['language'])):
 180             subtitles.setdefault(subtitle['language'], []).append(traverse_obj(subtitle, {
 181                 'url': ('path', {self._create_source_url}),
 182                 'note': ('label', {str}),
 183             }))
 184
 185         return {
 186             'id': video_id,
 187             'title': f'pr0gramm-{video_id} by {video_info.get("user")}',
 188             'tags': tags,
 189             'formats': formats,
 190             'subtitles': subtitles,
 191             'age_limit': 18 if traverse_obj(video_info, ('flags', {0b110.__and__})) else 0,
 192             '_old_archive_ids': [make_archive_id('Pr0grammStatic', video_id)],
 193             **traverse_obj(video_info, {
 194                 'uploader': ('user', {str}),
 195                 'uploader_id': ('userId', {str_or_none}),
 196                 'like_count': ('up', {int}),
 197                 'dislike_count': ('down', {int}),
 198                 'timestamp': ('created', {int}),
 199                 'thumbnail': ('thumb', {lambda x: urljoin('https://thumb.pr0gramm.com', x)})
 200             }),
 201         }