yt_dlp/extractor/wykop.py

   1 import json
   2 import urllib.error
   3
   4 from .common import InfoExtractor
   5 from ..utils import (
   6     ExtractorError,
   7     format_field,
   8     parse_iso8601,
   9     traverse_obj,
  10     url_or_none,
  11 )
  12
  13
  14 class WykopBaseExtractor(InfoExtractor):
  15     def _get_token(self, force_refresh=False):
  16         if not force_refresh:
  17             maybe_cached = self.cache.load('wykop', 'bearer')
  18             if maybe_cached:
  19                 return maybe_cached
  20
  21         new_token = traverse_obj(
  22             self._do_call_api('auth', None, 'Downloading anonymous auth token', data={
  23                 # hardcoded in frontend
  24                 'key': 'w53947240748',
  25                 'secret': 'd537d9e0a7adc1510842059ae5316419',
  26             }), ('data', 'token'))
  27
  28         self.cache.store('wykop', 'bearer', new_token)
  29         return new_token
  30
  31     def _do_call_api(self, path, video_id, note='Downloading JSON metadata', data=None, headers={}):
  32         if data:
  33             data = json.dumps({'data': data}).encode()
  34             headers['Content-Type'] = 'application/json'
  35
  36         return self._download_json(
  37             f'https://wykop.pl/api/v3/{path}', video_id,
  38             note=note, data=data, headers=headers)
  39
  40     def _call_api(self, path, video_id, note='Downloading JSON metadata'):
  41         token = self._get_token()
  42         for retrying in range(2):
  43             try:
  44                 return self._do_call_api(path, video_id, note, headers={'Authorization': f'Bearer {token}'})
  45             except ExtractorError as e:
  46                 if not retrying and isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403:
  47                     token = self._get_token(True)
  48                     continue
  49                 raise
  50
  51     def _common_data_extract(self, data):
  52         author = traverse_obj(data, ('author', 'username'), expected_type=str)
  53
  54         return {
  55             '_type': 'url_transparent',
  56             'display_id': data.get('slug'),
  57             'url': traverse_obj(data,
  58                                 ('media', 'embed', 'url'),  # what gets an iframe embed
  59                                 ('source', 'url'),  # clickable url (dig only)
  60                                 expected_type=url_or_none),
  61             'thumbnail': traverse_obj(
  62                 data, ('media', 'photo', 'url'), ('media', 'embed', 'thumbnail'), expected_type=url_or_none),
  63             'uploader': author,
  64             'uploader_id': author,
  65             'uploader_url': format_field(author, None, 'https://wykop.pl/ludzie/%s'),
  66             'timestamp': parse_iso8601(data.get('created_at'), delimiter=' '),  # time it got submitted
  67             'like_count': traverse_obj(data, ('votes', 'up'), expected_type=int),
  68             'dislike_count': traverse_obj(data, ('votes', 'down'), expected_type=int),
  69             'comment_count': traverse_obj(data, ('comments', 'count'), expected_type=int),
  70             'age_limit': 18 if data.get('adult') else 0,
  71             'tags': data.get('tags'),
  72         }
  73
  74
  75 class WykopDigIE(WykopBaseExtractor):
  76     IE_NAME = 'wykop:dig'
  77     _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P<id>\d+)'
  78
  79     _TESTS = [{
  80         'url': 'https://wykop.pl/link/6912923/najbardziej-zrzedliwy-kot-na-swiecie-i-frozen-planet-ii-i-bbc-earth',
  81         'info_dict': {
  82             'id': 'rlSTBvViflc',
  83             'ext': 'mp4',
  84             'title': 'Najbardziej zrzędliwy kot na świecie I Frozen Planet II I BBC Earth',
  85             'display_id': 'najbardziej-zrzedliwy-kot-na-swiecie-i-frozen-planet-ii-i-bbc-earth',
  86             'description': 'md5:ac0f87dea1cdcb6b0c53f3612a095c87',
  87             'tags': ['zwierzaczki', 'koty', 'smiesznykotek', 'humor', 'rozrywka', 'ciekawostki'],
  88             'age_limit': 0,
  89             'timestamp': 1669154480,
  90             'release_timestamp': 1669194241,
  91             'release_date': '20221123',
  92             'uploader': 'starnak',
  93             'uploader_id': 'starnak',
  94             'uploader_url': 'https://wykop.pl/ludzie/starnak',
  95             'like_count': int,
  96             'dislike_count': int,
  97             'comment_count': int,
  98             'thumbnail': r're:https?://wykop\.pl/cdn/.+',
  99             'view_count': int,
 100             'channel': 'BBC Earth',
 101             'channel_id': 'UCwmZiChSryoWQCZMIQezgTg',
 102             'channel_url': 'https://www.youtube.com/channel/UCwmZiChSryoWQCZMIQezgTg',
 103             'categories': ['Pets & Animals'],
 104             'upload_date': '20220923',
 105             'duration': 191,
 106             'channel_follower_count': int,
 107             'availability': 'public',
 108             'live_status': 'not_live',
 109             'playable_in_embed': True,
 110         },
 111     }]
 112
 113     @classmethod
 114     def suitable(cls, url):
 115         return cls._match_valid_url(url) and not WykopDigCommentIE.suitable(url)
 116
 117     def _real_extract(self, url):
 118         video_id = self._match_id(url)
 119         data = self._call_api(f'links/{video_id}', video_id)['data']
 120
 121         return {
 122             **self._common_data_extract(data),
 123             'id': video_id,
 124             'title': data['title'],
 125             'description': data.get('description'),
 126             # time it got "digged" to the homepage
 127             'release_timestamp': parse_iso8601(data.get('published_at'), delimiter=' '),
 128         }
 129
 130
 131 class WykopDigCommentIE(WykopBaseExtractor):
 132     IE_NAME = 'wykop:dig:comment'
 133     _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P<dig_id>\d+)/[^/]+/komentarz/(?P<id>\d+)'
 134
 135     _TESTS = [{
 136         'url': 'https://wykop.pl/link/6992589/strollowal-oszusta-przez-ponad-24-minuty-udawal-naiwniaka-i-nagral-rozmowe/komentarz/114540527/podobna-sytuacja-ponizej-ciekawa-dyskusja-z-oszustem-na-sam-koniec-sam-bylem-w-biurze-swiadkiem-podobnej-rozmowy-niemal-zakonczonej-sukcesem-bandyty-g',
 137         'info_dict': {
 138             'id': 'u6tEi2FmKZY',
 139             'ext': 'mp4',
 140             'title': 'md5:e7c741c5baa7ed6478000caf72865577',
 141             'display_id': 'md5:45b2d12bd0e262d09cc7cf7abc8412db',
 142             'description': 'md5:bcec7983429f9c0630f9deb9d3d1ba5e',
 143             'timestamp': 1674476945,
 144             'uploader': 'Bartholomew',
 145             'uploader_id': 'Bartholomew',
 146             'uploader_url': 'https://wykop.pl/ludzie/Bartholomew',
 147             'thumbnail': r're:https?://wykop\.pl/cdn/.+',
 148             'tags': [],
 149             'availability': 'public',
 150             'duration': 1838,
 151             'upload_date': '20230117',
 152             'categories': ['Entertainment'],
 153             'view_count': int,
 154             'like_count': int,
 155             'dislike_count': int,
 156             'comment_count': int,
 157             'channel_follower_count': int,
 158             'playable_in_embed': True,
 159             'live_status': 'not_live',
 160             'age_limit': 0,
 161             'chapters': 'count:3',
 162             'channel': 'Poszukiwacze Okazji',
 163             'channel_id': 'UCzzvJDZThwv06dR4xmzrZBw',
 164             'channel_url': 'https://www.youtube.com/channel/UCzzvJDZThwv06dR4xmzrZBw',
 165         },
 166     }]
 167
 168     def _real_extract(self, url):
 169         dig_id, comment_id = self._search_regex(self._VALID_URL, url, 'dig and comment ids', group=('dig_id', 'id'))
 170         data = self._call_api(f'links/{dig_id}/comments/{comment_id}', comment_id)['data']
 171
 172         return {
 173             **self._common_data_extract(data),
 174             'id': comment_id,
 175             'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}",
 176             'description': data.get('content'),
 177         }
 178
 179
 180 class WykopPostIE(WykopBaseExtractor):
 181     IE_NAME = 'wykop:post'
 182     _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P<id>\d+)'
 183
 184     _TESTS = [{
 185         'url': 'https://wykop.pl/wpis/68893343/kot-koty-smiesznykotek',
 186         'info_dict': {
 187             'id': 'PL8JMjiUPHUhwc9ZlKa_5IFeBwBV8Xe7jI',
 188             'title': 'PawelW124 - #kot #koty #smiesznykotek',
 189             'description': '#kot #koty #smiesznykotek',
 190             'display_id': 'kot-koty-smiesznykotek',
 191             'tags': ['kot', 'koty', 'smiesznykotek'],
 192             'uploader': 'PawelW124',
 193             'uploader_id': 'PawelW124',
 194             'uploader_url': 'https://wykop.pl/ludzie/PawelW124',
 195             'timestamp': 1668938142,
 196             'age_limit': 0,
 197             'like_count': int,
 198             'dislike_count': int,
 199             'thumbnail': r're:https?://wykop\.pl/cdn/.+',
 200             'comment_count': int,
 201             'channel': 'Revan',
 202             'channel_id': 'UCW9T_-uZoiI7ROARQdTDyOw',
 203             'channel_url': 'https://www.youtube.com/channel/UCW9T_-uZoiI7ROARQdTDyOw',
 204             'upload_date': '20221120',
 205             'modified_date': '20220814',
 206             'availability': 'public',
 207             'view_count': int,
 208         },
 209         'playlist_mincount': 15,
 210         'params': {
 211             'flat_playlist': True,
 212         }
 213     }]
 214
 215     @classmethod
 216     def suitable(cls, url):
 217         return cls._match_valid_url(url) and not WykopPostCommentIE.suitable(url)
 218
 219     def _real_extract(self, url):
 220         video_id = self._match_id(url)
 221         data = self._call_api(f'entries/{video_id}', video_id)['data']
 222
 223         return {
 224             **self._common_data_extract(data),
 225             'id': video_id,
 226             'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}",
 227             'description': data.get('content'),
 228         }
 229
 230
 231 class WykopPostCommentIE(WykopBaseExtractor):
 232     IE_NAME = 'wykop:post:comment'
 233     _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P<post_id>\d+)/[^/#]+#(?P<id>\d+)'
 234
 235     _TESTS = [{
 236         'url': 'https://wykop.pl/wpis/70084873/test-test-test#249303979',
 237         'info_dict': {
 238             'id': 'confusedquickarmyant',
 239             'ext': 'mp4',
 240             'title': 'tpap - treść komentarza',
 241             'display_id': 'tresc-komentarza',
 242             'description': 'treść komentarza',
 243             'uploader': 'tpap',
 244             'uploader_id': 'tpap',
 245             'uploader_url': 'https://wykop.pl/ludzie/tpap',
 246             'timestamp': 1675349470,
 247             'upload_date': '20230202',
 248             'tags': [],
 249             'duration': 2.12,
 250             'age_limit': 0,
 251             'categories': [],
 252             'view_count': int,
 253             'like_count': int,
 254             'dislike_count': int,
 255             'thumbnail': r're:https?://wykop\.pl/cdn/.+',
 256         },
 257     }]
 258
 259     def _real_extract(self, url):
 260         post_id, comment_id = self._search_regex(self._VALID_URL, url, 'post and comment ids', group=('post_id', 'id'))
 261         data = self._call_api(f'entries/{post_id}/comments/{comment_id}', comment_id)['data']
 262
 263         return {
 264             **self._common_data_extract(data),
 265             'id': comment_id,
 266             'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}",
 267             'description': data.get('content'),
 268         }