[yt-dlp.git] / yt_dlp / extractor / wykop.py

import json

from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import (
    ExtractorError,
    format_field,
    parse_iso8601,
    traverse_obj,
    url_or_none,
)


class WykopBaseExtractor(InfoExtractor):
    """Shared wykop.pl API v3 helpers used by the concrete Wykop extractors."""

    def _get_token(self, force_refresh=False):
        """Return a bearer token for the API, using the on-disk cache when allowed.

        With ``force_refresh`` false, a truthy value stored under the
        ``('wykop', 'bearer')`` cache key is returned as-is. Otherwise a new
        anonymous token is requested from the ``auth`` endpoint, written to the
        cache and returned.
        """
        if not force_refresh:
            maybe_cached = self.cache.load('wykop', 'bearer')
            if maybe_cached:
                return maybe_cached

        new_token = traverse_obj(
            self._do_call_api('auth', None, 'Downloading anonymous auth token', data={
                # hardcoded in frontend
                'key': 'w53947240748',
                'secret': 'd537d9e0a7adc1510842059ae5316419',
            }), ('data', 'token'))

        self.cache.store('wykop', 'bearer', new_token)
        return new_token

    def _do_call_api(self, path, video_id, note='Downloading JSON metadata', data=None, headers=None):
        """Perform a single request against ``https://wykop.pl/api/v3/<path>``.

        When ``data`` is truthy it is wrapped as ``{'data': data}``, serialized
        to JSON and sent as the request body with a JSON ``Content-Type``.
        ``headers`` is an optional mapping of extra request headers; it is
        never modified. Returns whatever ``_download_json`` returns.
        """
        # Work on a private copy: the previous ``headers={}`` default was a single
        # dict shared by every call, so the Content-Type set for the auth request
        # leaked into later requests (and into any dict passed by the caller).
        headers = dict(headers or {})
        if data:
            data = json.dumps({'data': data}).encode()
            headers['Content-Type'] = 'application/json'

        return self._download_json(
            f'https://wykop.pl/api/v3/{path}', video_id,
            note=note, data=data, headers=headers)

    def _call_api(self, path, video_id, note='Downloading JSON metadata'):
        """Call an API endpoint with bearer authorization.

        If the first attempt fails with an HTTP 403, the token is refreshed
        (bypassing the cache) and the request is repeated once. Any other
        error, or a failure of the second attempt, is re-raised.
        """
        token = self._get_token()
        for retrying in range(2):
            try:
                return self._do_call_api(path, video_id, note, headers={'Authorization': f'Bearer {token}'})
            except ExtractorError as e:
                # Only the first 403 triggers a token refresh; presumably the
                # cached token has expired or been revoked.
                if not retrying and isinstance(e.cause, HTTPError) and e.cause.status == 403:
                    token = self._get_token(True)
                    continue
                raise

    def _common_data_extract(self, data):
        """Build the info-dict fields shared by every Wykop item type.

        ``data`` is the ``data`` object of an API response. The result is a
        ``url_transparent`` entry pointing at the embedded or linked media URL,
        so the final formats come from whichever extractor handles that URL.
        """
        author = traverse_obj(data, ('author', 'username'), expected_type=str)

        return {
            '_type': 'url_transparent',
            'display_id': data.get('slug'),
            'url': traverse_obj(data,
                                ('media', 'embed', 'url'),  # what gets an iframe embed
                                ('source', 'url'),  # clickable url (dig only)
                                expected_type=url_or_none),
            'thumbnail': traverse_obj(
                data, ('media', 'photo', 'url'), ('media', 'embed', 'thumbnail'), expected_type=url_or_none),
            'uploader': author,
            'uploader_id': author,
            'uploader_url': format_field(author, None, 'https://wykop.pl/ludzie/%s'),
            'timestamp': parse_iso8601(data.get('created_at'), delimiter=' '),  # time it got submitted
            'like_count': traverse_obj(data, ('votes', 'up'), expected_type=int),
            'dislike_count': traverse_obj(data, ('votes', 'down'), expected_type=int),
            'comment_count': traverse_obj(data, ('comments', 'count'), expected_type=int),
            'age_limit': 18 if data.get('adult') else 0,
            'tags': data.get('tags'),
        }


class WykopDigIE(WykopBaseExtractor):
    """Extractor for wykop.pl "dig" pages (``/link/<id>``)."""

    IE_NAME = 'wykop:dig'
    _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://wykop.pl/link/6912923/najbardziej-zrzedliwy-kot-na-swiecie-i-frozen-planet-ii-i-bbc-earth',
        'info_dict': {
            'id': 'rlSTBvViflc',
            'ext': 'mp4',
            'title': 'Najbardziej zrzędliwy kot na świecie I Frozen Planet II I BBC Earth',
            'display_id': 'najbardziej-zrzedliwy-kot-na-swiecie-i-frozen-planet-ii-i-bbc-earth',
            'description': 'md5:ac0f87dea1cdcb6b0c53f3612a095c87',
            'tags': ['zwierzaczki', 'koty', 'smiesznykotek', 'humor', 'rozrywka', 'ciekawostki'],
            'age_limit': 0,
            'timestamp': 1669154480,
            'release_timestamp': 1669194241,
            'release_date': '20221123',
            'uploader': 'starnak',
            'uploader_id': 'starnak',
            'uploader_url': 'https://wykop.pl/ludzie/starnak',
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'thumbnail': r're:https?://wykop\.pl/cdn/.+',
            'view_count': int,
            'channel': 'BBC Earth',
            'channel_id': 'UCwmZiChSryoWQCZMIQezgTg',
            'channel_url': 'https://www.youtube.com/channel/UCwmZiChSryoWQCZMIQezgTg',
            'categories': ['Pets & Animals'],
            'upload_date': '20220923',
            'duration': 191,
            'channel_follower_count': int,
            'availability': 'public',
            'live_status': 'not_live',
            'playable_in_embed': True,
        },
    }]

    @classmethod
    def suitable(cls, url):
        """Match dig URLs, leaving those that WykopDigCommentIE accepts to it."""
        matched = cls._match_valid_url(url)
        if not matched:
            return matched
        return not WykopDigCommentIE.suitable(url)

    def _real_extract(self, url):
        """Fetch the ``links/<id>`` API object and turn it into an info dict."""
        link_id = self._match_id(url)
        link = self._call_api(f'links/{link_id}', link_id)['data']

        info = self._common_data_extract(link)
        info.update({
            'id': link_id,
            'title': link['title'],
            'description': link.get('description'),
            # time it got "digged" to the homepage
            'release_timestamp': parse_iso8601(link.get('published_at'), delimiter=' '),
        })
        return info


class WykopDigCommentIE(WykopBaseExtractor):
    """Extractor for a single comment under a wykop.pl dig (``/link/<id>/<slug>/komentarz/<id>``)."""

    IE_NAME = 'wykop:dig:comment'
    _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P<dig_id>\d+)/[^/]+/komentarz/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://wykop.pl/link/6992589/strollowal-oszusta-przez-ponad-24-minuty-udawal-naiwniaka-i-nagral-rozmowe/komentarz/114540527/podobna-sytuacja-ponizej-ciekawa-dyskusja-z-oszustem-na-sam-koniec-sam-bylem-w-biurze-swiadkiem-podobnej-rozmowy-niemal-zakonczonej-sukcesem-bandyty-g',
        'info_dict': {
            'id': 'u6tEi2FmKZY',
            'ext': 'mp4',
            'title': 'md5:e7c741c5baa7ed6478000caf72865577',
            'display_id': 'md5:45b2d12bd0e262d09cc7cf7abc8412db',
            'description': 'md5:bcec7983429f9c0630f9deb9d3d1ba5e',
            'timestamp': 1674476945,
            'uploader': 'Bartholomew',
            'uploader_id': 'Bartholomew',
            'uploader_url': 'https://wykop.pl/ludzie/Bartholomew',
            'thumbnail': r're:https?://wykop\.pl/cdn/.+',
            'tags': [],
            'availability': 'public',
            'duration': 1838,
            'upload_date': '20230117',
            'categories': ['Entertainment'],
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'channel_follower_count': int,
            'playable_in_embed': True,
            'live_status': 'not_live',
            'age_limit': 0,
            'chapters': 'count:3',
            'channel': 'Poszukiwacze Okazji',
            'channel_id': 'UCzzvJDZThwv06dR4xmzrZBw',
            'channel_url': 'https://www.youtube.com/channel/UCzzvJDZThwv06dR4xmzrZBw',
        },
    }]

    def _real_extract(self, url):
        """Fetch the ``links/<dig>/comments/<comment>`` API object and build the info dict."""
        dig_id, comment_id = self._search_regex(
            self._VALID_URL, url, 'dig and comment ids', group=('dig_id', 'id'))
        comment = self._call_api(f'links/{dig_id}/comments/{comment_id}', comment_id)['data']

        info = self._common_data_extract(comment)
        username = traverse_obj(comment, ('author', 'username'))
        content = comment.get('content')
        info.update({
            'id': comment_id,
            # The comment text doubles as the title, prefixed with its author.
            'title': f"{username} - {content or ''}",
            'description': content,
        })
        return info


class WykopPostIE(WykopBaseExtractor):
    """Extractor for wykop.pl microblog posts (``/wpis/<id>``)."""

    IE_NAME = 'wykop:post'
    _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://wykop.pl/wpis/68893343/kot-koty-smiesznykotek',
        'info_dict': {
            'id': 'PL8JMjiUPHUhwc9ZlKa_5IFeBwBV8Xe7jI',
            'title': 'PawelW124 - #kot #koty #smiesznykotek',
            'description': '#kot #koty #smiesznykotek',
            'display_id': 'kot-koty-smiesznykotek',
            'tags': ['kot', 'koty', 'smiesznykotek'],
            'uploader': 'PawelW124',
            'uploader_id': 'PawelW124',
            'uploader_url': 'https://wykop.pl/ludzie/PawelW124',
            'timestamp': 1668938142,
            'age_limit': 0,
            'like_count': int,
            'dislike_count': int,
            'thumbnail': r're:https?://wykop\.pl/cdn/.+',
            'comment_count': int,
            'channel': 'Revan',
            'channel_id': 'UCW9T_-uZoiI7ROARQdTDyOw',
            'channel_url': 'https://www.youtube.com/channel/UCW9T_-uZoiI7ROARQdTDyOw',
            'upload_date': '20221120',
            'modified_date': '20220814',
            'availability': 'public',
            'view_count': int,
        },
        'playlist_mincount': 15,
        'params': {
            'flat_playlist': True,
        },
    }]

    @classmethod
    def suitable(cls, url):
        """Match post URLs, leaving those that WykopPostCommentIE accepts to it."""
        matched = cls._match_valid_url(url)
        if not matched:
            return matched
        return not WykopPostCommentIE.suitable(url)

    def _real_extract(self, url):
        """Fetch the ``entries/<id>`` API object and turn it into an info dict."""
        entry_id = self._match_id(url)
        entry = self._call_api(f'entries/{entry_id}', entry_id)['data']

        info = self._common_data_extract(entry)
        username = traverse_obj(entry, ('author', 'username'))
        content = entry.get('content')
        info.update({
            'id': entry_id,
            # The post text doubles as the title, prefixed with its author.
            'title': f"{username} - {content or ''}",
            'description': content,
        })
        return info


class WykopPostCommentIE(WykopBaseExtractor):
    """Extractor for a single comment under a wykop.pl post (``/wpis/<id>/<slug>#<id>``)."""

    IE_NAME = 'wykop:post:comment'
    _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P<post_id>\d+)/[^/#]+#(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://wykop.pl/wpis/70084873/test-test-test#249303979',
        'info_dict': {
            'id': 'confusedquickarmyant',
            'ext': 'mp4',
            'title': 'tpap - treść komentarza',
            'display_id': 'tresc-komentarza',
            'description': 'treść komentarza',
            'uploader': 'tpap',
            'uploader_id': 'tpap',
            'uploader_url': 'https://wykop.pl/ludzie/tpap',
            'timestamp': 1675349470,
            'upload_date': '20230202',
            'tags': [],
            'duration': 2.12,
            'age_limit': 0,
            'categories': [],
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'thumbnail': r're:https?://wykop\.pl/cdn/.+',
        },
    }]

    def _real_extract(self, url):
        """Fetch the ``entries/<post>/comments/<comment>`` API object and build the info dict."""
        post_id, comment_id = self._search_regex(
            self._VALID_URL, url, 'post and comment ids', group=('post_id', 'id'))
        comment = self._call_api(f'entries/{post_id}/comments/{comment_id}', comment_id)['data']

        info = self._common_data_extract(comment)
        username = traverse_obj(comment, ('author', 'username'))
        content = comment.get('content')
        info.update({
            'id': comment_id,
            # The comment text doubles as the title, prefixed with its author.
            'title': f"{username} - {content or ''}",
            'description': content,
        })
        return info
Commit	Line	Data
aed945e1	1	import json
aed945e1	2
aed945e1	3	from .common import InfoExtractor
3d2623a8	4	from ..networking.exceptions import HTTPError
aed945e1	5	from ..utils import (
	6	ExtractorError,
	7	format_field,
	8	parse_iso8601,
	9	traverse_obj,
	10	url_or_none,
	11	)
	12
	13
	14	class WykopBaseExtractor(InfoExtractor):
	15	def _get_token(self, force_refresh=False):
	16	if not force_refresh:
	17	maybe_cached = self.cache.load('wykop', 'bearer')
	18	if maybe_cached:
	19	return maybe_cached
	20
	21	new_token = traverse_obj(
	22	self._do_call_api('auth', None, 'Downloading anonymous auth token', data={
	23	# hardcoded in frontend
	24	'key': 'w53947240748',
	25	'secret': 'd537d9e0a7adc1510842059ae5316419',
	26	}), ('data', 'token'))
	27
	28	self.cache.store('wykop', 'bearer', new_token)
	29	return new_token
	30
	31	def _do_call_api(self, path, video_id, note='Downloading JSON metadata', data=None, headers={}):
	32	if data:
	33	data = json.dumps({'data': data}).encode()
	34	headers['Content-Type'] = 'application/json'
	35
	36	return self._download_json(
	37	f'https://wykop.pl/api/v3/{path}', video_id,
	38	note=note, data=data, headers=headers)
	39
	40	def _call_api(self, path, video_id, note='Downloading JSON metadata'):
	41	token = self._get_token()
	42	for retrying in range(2):
	43	try:
	44	return self._do_call_api(path, video_id, note, headers={'Authorization': f'Bearer {token}'})
	45	except ExtractorError as e:
3d2623a8	46	if not retrying and isinstance(e.cause, HTTPError) and e.cause.status == 403:
aed945e1	47	token = self._get_token(True)
	48	continue
	49	raise
	50
	51	def _common_data_extract(self, data):
	52	author = traverse_obj(data, ('author', 'username'), expected_type=str)
	53
	54	return {
	55	'_type': 'url_transparent',
	56	'display_id': data.get('slug'),
	57	'url': traverse_obj(data,
	58	('media', 'embed', 'url'), # what gets an iframe embed
	59	('source', 'url'), # clickable url (dig only)
	60	expected_type=url_or_none),
	61	'thumbnail': traverse_obj(
	62	data, ('media', 'photo', 'url'), ('media', 'embed', 'thumbnail'), expected_type=url_or_none),
	63	'uploader': author,
	64	'uploader_id': author,
	65	'uploader_url': format_field(author, None, 'https://wykop.pl/ludzie/%s'),
	66	'timestamp': parse_iso8601(data.get('created_at'), delimiter=' '), # time it got submitted
	67	'like_count': traverse_obj(data, ('votes', 'up'), expected_type=int),
	68	'dislike_count': traverse_obj(data, ('votes', 'down'), expected_type=int),
	69	'comment_count': traverse_obj(data, ('comments', 'count'), expected_type=int),
	70	'age_limit': 18 if data.get('adult') else 0,
	71	'tags': data.get('tags'),
	72	}
	73
	74
	75	class WykopDigIE(WykopBaseExtractor):
	76	IE_NAME = 'wykop:dig'
	77	_VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P<id>\d+)'
	78
	79	_TESTS = [{
	80	'url': 'https://wykop.pl/link/6912923/najbardziej-zrzedliwy-kot-na-swiecie-i-frozen-planet-ii-i-bbc-earth',
	81	'info_dict': {
	82	'id': 'rlSTBvViflc',
	83	'ext': 'mp4',
	84	'title': 'Najbardziej zrzędliwy kot na świecie I Frozen Planet II I BBC Earth',
	85	'display_id': 'najbardziej-zrzedliwy-kot-na-swiecie-i-frozen-planet-ii-i-bbc-earth',
	86	'description': 'md5:ac0f87dea1cdcb6b0c53f3612a095c87',
	87	'tags': ['zwierzaczki', 'koty', 'smiesznykotek', 'humor', 'rozrywka', 'ciekawostki'],
	88	'age_limit': 0,
	89	'timestamp': 1669154480,
	90	'release_timestamp': 1669194241,
	91	'release_date': '20221123',
	92	'uploader': 'starnak',
	93	'uploader_id': 'starnak',
	94	'uploader_url': 'https://wykop.pl/ludzie/starnak',
	95	'like_count': int,
	96	'dislike_count': int,
	97	'comment_count': int,
	98	'thumbnail': r're:https?://wykop\.pl/cdn/.+',
	99	'view_count': int,
	100	'channel': 'BBC Earth',
	101	'channel_id': 'UCwmZiChSryoWQCZMIQezgTg',
	102	'channel_url': 'https://www.youtube.com/channel/UCwmZiChSryoWQCZMIQezgTg',
	103	'categories': ['Pets & Animals'],
	104	'upload_date': '20220923',
	105	'duration': 191,
	106	'channel_follower_count': int,
	107	'availability': 'public',
	108	'live_status': 'not_live',
	109	'playable_in_embed': True,
	110	},
111	}]
112
113	@classmethod
114	def suitable(cls, url):
115	return cls._match_valid_url(url) and not WykopDigCommentIE.suitable(url)
116
117	def _real_extract(self, url):
118	video_id = self._match_id(url)
119	data = self._call_api(f'links/{video_id}', video_id)['data']
120
121	return {
122	**self._common_data_extract(data),
123	'id': video_id,
124	'title': data['title'],
125	'description': data.get('description'),
126	# time it got "digged" to the homepage
127	'release_timestamp': parse_iso8601(data.get('published_at'), delimiter=' '),
128	}
129
130
131	class WykopDigCommentIE(WykopBaseExtractor):
132	IE_NAME = 'wykop:dig:comment'
133	_VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P<dig_id>\d+)/[^/]+/komentarz/(?P<id>\d+)'
134
135	_TESTS = [{
136	'url': 'https://wykop.pl/link/6992589/strollowal-oszusta-przez-ponad-24-minuty-udawal-naiwniaka-i-nagral-rozmowe/komentarz/114540527/podobna-sytuacja-ponizej-ciekawa-dyskusja-z-oszustem-na-sam-koniec-sam-bylem-w-biurze-swiadkiem-podobnej-rozmowy-niemal-zakonczonej-sukcesem-bandyty-g',
137	'info_dict': {
138	'id': 'u6tEi2FmKZY',
139	'ext': 'mp4',
140	'title': 'md5:e7c741c5baa7ed6478000caf72865577',
141	'display_id': 'md5:45b2d12bd0e262d09cc7cf7abc8412db',
142	'description': 'md5:bcec7983429f9c0630f9deb9d3d1ba5e',
143	'timestamp': 1674476945,
144	'uploader': 'Bartholomew',
145	'uploader_id': 'Bartholomew',
146	'uploader_url': 'https://wykop.pl/ludzie/Bartholomew',
147	'thumbnail': r're:https?://wykop\.pl/cdn/.+',
148	'tags': [],
149	'availability': 'public',
150	'duration': 1838,
151	'upload_date': '20230117',
152	'categories': ['Entertainment'],
153	'view_count': int,
154	'like_count': int,
155	'dislike_count': int,
156	'comment_count': int,
157	'channel_follower_count': int,
158	'playable_in_embed': True,
159	'live_status': 'not_live',
160	'age_limit': 0,
161	'chapters': 'count:3',
162	'channel': 'Poszukiwacze Okazji',
163	'channel_id': 'UCzzvJDZThwv06dR4xmzrZBw',
164	'channel_url': 'https://www.youtube.com/channel/UCzzvJDZThwv06dR4xmzrZBw',
165	},
166	}]
167
168	def _real_extract(self, url):
169	dig_id, comment_id = self._search_regex(self._VALID_URL, url, 'dig and comment ids', group=('dig_id', 'id'))
170	data = self._call_api(f'links/{dig_id}/comments/{comment_id}', comment_id)['data']
171
172	return {
173	**self._common_data_extract(data),
174	'id': comment_id,
175	'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}",
176	'description': data.get('content'),
177	}
178
179
180	class WykopPostIE(WykopBaseExtractor):
181	IE_NAME = 'wykop:post'
182	_VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P<id>\d+)'
183
184	_TESTS = [{
185	'url': 'https://wykop.pl/wpis/68893343/kot-koty-smiesznykotek',
186	'info_dict': {
187	'id': 'PL8JMjiUPHUhwc9ZlKa_5IFeBwBV8Xe7jI',
188	'title': 'PawelW124 - #kot #koty #smiesznykotek',
189	'description': '#kot #koty #smiesznykotek',
190	'display_id': 'kot-koty-smiesznykotek',
191	'tags': ['kot', 'koty', 'smiesznykotek'],
192	'uploader': 'PawelW124',
193	'uploader_id': 'PawelW124',
194	'uploader_url': 'https://wykop.pl/ludzie/PawelW124',
195	'timestamp': 1668938142,
196	'age_limit': 0,
197	'like_count': int,
198	'dislike_count': int,
199	'thumbnail': r're:https?://wykop\.pl/cdn/.+',
200	'comment_count': int,
201	'channel': 'Revan',
202	'channel_id': 'UCW9T_-uZoiI7ROARQdTDyOw',
203	'channel_url': 'https://www.youtube.com/channel/UCW9T_-uZoiI7ROARQdTDyOw',
204	'upload_date': '20221120',
205	'modified_date': '20220814',
206	'availability': 'public',
207	'view_count': int,
208	},
209	'playlist_mincount': 15,
210	'params': {
211	'flat_playlist': True,
212	}
213	}]
214
215	@classmethod
216	def suitable(cls, url):
217	return cls._match_valid_url(url) and not WykopPostCommentIE.suitable(url)
218
219	def _real_extract(self, url):
220	video_id = self._match_id(url)
221	data = self._call_api(f'entries/{video_id}', video_id)['data']
222
223	return {
224	**self._common_data_extract(data),
225	'id': video_id,
226	'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}",
227	'description': data.get('content'),
228	}
229
230
231	class WykopPostCommentIE(WykopBaseExtractor):
232	IE_NAME = 'wykop:post:comment'
233	_VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P<post_id>\d+)/[^/#]+#(?P<id>\d+)'
234
235	_TESTS = [{
236	'url': 'https://wykop.pl/wpis/70084873/test-test-test#249303979',
237	'info_dict': {
238	'id': 'confusedquickarmyant',
239	'ext': 'mp4',
240	'title': 'tpap - treść komentarza',
241	'display_id': 'tresc-komentarza',
242	'description': 'treść komentarza',
243	'uploader': 'tpap',
244	'uploader_id': 'tpap',
245	'uploader_url': 'https://wykop.pl/ludzie/tpap',
246	'timestamp': 1675349470,
247	'upload_date': '20230202',
248	'tags': [],
249	'duration': 2.12,
250	'age_limit': 0,
251	'categories': [],
252	'view_count': int,
253	'like_count': int,
254	'dislike_count': int,
255	'thumbnail': r're:https?://wykop\.pl/cdn/.+',
256	},
257	}]
258
259	def _real_extract(self, url):
260	post_id, comment_id = self._search_regex(self._VALID_URL, url, 'post and comment ids', group=('post_id', 'id'))
261	data = self._call_api(f'entries/{post_id}/comments/{comment_id}', comment_id)['data']
262
263	return {
264	**self._common_data_extract(data),
265	'id': comment_id,
266	'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}",
267	'description': data.get('content'),
268	}