]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/redgifs.py
[extractor/redgifs] Fix extractor (#4892)
[yt-dlp.git] / yt_dlp / extractor / redgifs.py
1 import functools
2
3 from .common import InfoExtractor
4 from ..compat import compat_parse_qs
5 from ..utils import (
6 ExtractorError,
7 int_or_none,
8 qualities,
9 try_get,
10 OnDemandPagedList,
11 )
12
13
class RedGifsBaseInfoExtractor(InfoExtractor):
    """Shared helpers for the RedGifs extractors.

    Provides authenticated API access, paging over list endpoints and the
    conversion of API ``gif`` objects into info dicts. Subclasses that use
    `_paged_entries` must define ``_PAGE_SIZE``.
    """

    # Format id (key of the API's ``urls`` mapping) -> height cap used by
    # `_parse_gif_data`. ``None`` means "no cap": the original height is used.
    _FORMATS = {
        'gif': 250,
        'sd': 480,
        'hd': None,
    }

    # Headers sent with every API request.
    # NOTE: this is a class-level dict and `_fetch_oauth_token` stores the
    # bearer token in it, so the token is shared by every class/instance that
    # inherits this attribute.
    _API_HEADERS = {
        'referer': 'https://www.redgifs.com/',
        'origin': 'https://www.redgifs.com',
        'content-type': 'application/json',
    }

    def _parse_gif_data(self, gif_data):
        """Build an info dict from a single API ``gif`` object.

        ``gif_data`` must contain a ``urls`` mapping (a missing key raises
        ``KeyError``); every other field is read leniently and may be absent.
        """
        video_id = gif_data.get('id')
        tags = gif_data.get('tags')
        quality = qualities(tuple(self._FORMATS.keys()))

        # Treat a missing or zero height as unknown instead of crashing on it.
        orig_height = int_or_none(gif_data.get('height')) or None
        # Width-to-height ratio of the original media, so that multiplying a
        # (possibly capped) height by it yields the matching width.
        # try_get is expected to return None if the lookup or division fails.
        aspect_ratio = (
            try_get(gif_data, lambda x: x['width'] / orig_height)
            if orig_height else None)

        formats = []
        for format_id, max_height in self._FORMATS.items():
            video_url = gif_data['urls'].get(format_id)
            if not video_url:
                continue
            if orig_height is None:
                # Original size unknown: fall back to the nominal cap (may be None).
                height = max_height
            else:
                height = min(orig_height, max_height or orig_height)
            formats.append({
                'url': video_url,
                'format_id': format_id,
                'width': round(height * aspect_ratio) if height and aspect_ratio else None,
                'height': height,
                'quality': quality(format_id),
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'webpage_url': f'https://redgifs.com/watch/{video_id}',
            'extractor_key': RedGifsIE.ie_key(),
            'extractor': 'RedGifs',
            'title': ' '.join(tags or []) or 'RedGifs',
            'timestamp': int_or_none(gif_data.get('createDate')),
            'uploader': gif_data.get('userName'),
            'duration': int_or_none(gif_data.get('duration')),
            'view_count': int_or_none(gif_data.get('views')),
            'like_count': int_or_none(gif_data.get('likes')),
            'categories': tags or [],
            'tags': tags,
            'age_limit': 18,
            'formats': formats,
        }

    def _fetch_oauth_token(self, video_id):
        """Scrape the OAuth bearer token from the site's JS bundle.

        Downloads the watch page for ``video_id``, locates the index JS asset
        referenced by it and stores the token found there in
        ``_API_HEADERS['authorization']``.
        """
        # These pages contain the OAuth token that is necessary to make API calls.
        index_page = self._download_webpage(f'https://www.redgifs.com/watch/{video_id}', video_id)
        index_js_uri = self._html_search_regex(
            r'href="?(/assets/js/index[.a-z0-9]*\.js)"?\W', index_page, 'index_js_uri')
        # index_js_uri already starts with a slash, so do not add another one.
        index_js = self._download_webpage(f'https://www.redgifs.com{index_js_uri}', video_id)
        # It turns out that a { followed by any valid JSON punctuation will always result in the
        # first two characters of the base64 encoding being "ey".
        # Use this fact to find any such string constant of a reasonable length with the correct
        # punctuation for an oauth token
        oauth_token = self._html_search_regex(
            r'\w+\s*[=:]\s*"(ey[^"]+\.[^"]*\.[^"]{43,45})"', index_js, 'oauth token')
        self._API_HEADERS['authorization'] = f'Bearer {oauth_token}'

    def _call_api(self, ep, video_id, *args, **kwargs):
        """Call API endpoint ``ep`` and return the decoded JSON response.

        Fetches the OAuth token first if none is stored yet. Extra positional
        and keyword arguments are forwarded to ``_download_json``.

        Raises ExtractorError if no token could be obtained or if the response
        contains an ``error`` key.
        """
        if 'authorization' not in self._API_HEADERS:
            self._fetch_oauth_token(video_id)
        # Explicit check instead of ``assert``, which is stripped under ``python -O``.
        if 'authorization' not in self._API_HEADERS:
            raise ExtractorError('Unable to obtain RedGifs OAuth token', video_id=video_id)

        # Copy so the per-request header does not leak into the shared dict.
        headers = dict(self._API_HEADERS)
        headers['x-customheader'] = f'https://www.redgifs.com/watch/{video_id}'
        data = self._download_json(
            f'https://api.redgifs.com/v2/{ep}', video_id, *args, headers=headers, **kwargs)
        if 'error' in data:
            raise ExtractorError(f'RedGifs said: {data["error"]}', expected=True, video_id=video_id)
        return data

    def _fetch_page(self, ep, video_id, query, page):
        """Yield the parsed entries of one result page.

        ``page`` is offset by one before being sent to the API (``page + 1``).
        ``query`` is not modified; the page number is added to a copy.
        """
        page_number = page + 1
        data = self._call_api(
            ep, video_id, query={**query, 'page': page_number},
            note=f'Downloading JSON metadata page {page_number}')

        for entry in data['gifs']:
            yield self._parse_gif_data(entry)

    def _prepare_api_query(self, query, fields):
        """Select the API parameters named in ``fields`` from a parsed query string.

        ``query`` maps names to sequences of values (only the first value is
        used); ``fields`` maps each accepted name to its default. Parameters
        whose resulting value is None are omitted.
        """
        api_query = {}
        for field_name, default in fields.items():
            value = query.get(field_name, (default,))[0]
            if value is not None:
                api_query[field_name] = value
        return api_query

    def _paged_entries(self, ep, item_id, query, fields):
        """Return the entries for a list endpoint.

        If ``query`` carries a truthy ``page`` value only that single page is
        fetched; otherwise an OnDemandPagedList over all pages is returned,
        using the subclass's ``_PAGE_SIZE``.
        """
        page = int_or_none(query.get('page', (None,))[0])
        page_fetcher = functools.partial(
            self._fetch_page, ep, item_id, self._prepare_api_query(query, fields))
        if page:
            # NOTE(review): _fetch_page requests API page ``page + 1`` here, i.e.
            # an explicit ``page=N`` in the URL asks the API for page N + 1.
            # Confirm this offset is intended.
            return page_fetcher(page)
        return OnDemandPagedList(page_fetcher, self._PAGE_SIZE)
113
114
class RedGifsIE(RedGifsBaseInfoExtractor):
    """Extractor for a single RedGifs item (watch page or thumbs2 media URL)."""

    _VALID_URL = r'https?://(?:(?:www\.)?redgifs\.com/watch/|thumbs2\.redgifs\.com/)(?P<id>[^-/?#\.]+)'
    _TESTS = [
        {
            'url': 'https://www.redgifs.com/watch/squeakyhelplesswisent',
            'info_dict': {
                'id': 'squeakyhelplesswisent',
                'ext': 'mp4',
                'title': 'Hotwife Legs Thick',
                'timestamp': 1636287915,
                'upload_date': '20211107',
                'uploader': 'ignored52',
                'duration': 16,
                'view_count': int,
                'like_count': int,
                'categories': list,
                'age_limit': 18,
                'tags': list,
            },
        },
        {
            'url': 'https://thumbs2.redgifs.com/SqueakyHelplessWisent-mobile.mp4#t=0',
            'info_dict': {
                'id': 'squeakyhelplesswisent',
                'ext': 'mp4',
                'title': 'Hotwife Legs Thick',
                'timestamp': 1636287915,
                'upload_date': '20211107',
                'uploader': 'ignored52',
                'duration': 16,
                'view_count': int,
                'like_count': int,
                'categories': list,
                'age_limit': 18,
                'tags': list,
            },
        },
    ]

    def _real_extract(self, url):
        """Look up the lower-cased id from ``url`` via the API and parse the result."""
        gif_id = self._match_id(url).lower()
        endpoint = f'gifs/{gif_id}?views=yes'
        response = self._call_api(endpoint, gif_id, note='Downloading video info')
        return self._parse_gif_data(response['gif'])
156
157
class RedGifsSearchIE(RedGifsBaseInfoExtractor):
    """Playlist extractor for RedGifs ``/browse?...`` search URLs."""

    IE_DESC = 'Redgifs search'
    _VALID_URL = r'https?://(?:www\.)?redgifs\.com/browse\?(?P<query>[^#]+)'
    _PAGE_SIZE = 80
    _TESTS = [{
        'url': 'https://www.redgifs.com/browse?tags=Lesbian',
        'info_dict': {
            'id': 'tags=Lesbian',
            'title': 'Lesbian',
            'description': 'RedGifs search for Lesbian, ordered by trending',
        },
        'playlist_mincount': 100,
    }, {
        'url': 'https://www.redgifs.com/browse?type=g&order=latest&tags=Lesbian',
        'info_dict': {
            'id': 'type=g&order=latest&tags=Lesbian',
            'title': 'Lesbian',
            'description': 'RedGifs search for Lesbian, ordered by latest',
        },
        'playlist_mincount': 100,
    }, {
        'url': 'https://www.redgifs.com/browse?type=g&order=latest&tags=Lesbian&page=2',
        'info_dict': {
            'id': 'type=g&order=latest&tags=Lesbian&page=2',
            'title': 'Lesbian',
            'description': 'RedGifs search for Lesbian, ordered by latest',
        },
        'playlist_count': 80,
    }]

    def _real_extract(self, url):
        """Return a playlist of search results for the ``tags`` given in ``url``.

        Raises ExtractorError when the query string has no ``tags`` value.
        """
        query_str = self._match_valid_url(url).group('query')
        params = compat_parse_qs(query_str)

        tag_values = params.get('tags')
        if not tag_values:
            raise ExtractorError('Invalid query tags', expected=True)
        tags = tag_values[0]
        order = params.get('order', ('trending',))[0]

        # The API takes the search term as ``search_text`` rather than ``tags``.
        params['search_text'] = [tags]
        # Accepted API parameters and their defaults.
        api_fields = {
            'search_text': None,
            'order': 'trending',
            'type': None,
        }
        entries = self._paged_entries('gifs/search', query_str, params, api_fields)

        description = f'RedGifs search for {tags}, ordered by {order}'
        return self.playlist_result(entries, query_str, tags, description)
210
211
class RedGifsUserIE(RedGifsBaseInfoExtractor):
    """Playlist extractor for RedGifs ``/users/<name>`` pages."""

    IE_DESC = 'Redgifs user'
    _VALID_URL = r'https?://(?:www\.)?redgifs\.com/users/(?P<username>[^/?#]+)(?:\?(?P<query>[^#]+))?'
    _PAGE_SIZE = 30
    _TESTS = [{
        'url': 'https://www.redgifs.com/users/lamsinka89',
        'info_dict': {
            'id': 'lamsinka89',
            'title': 'lamsinka89',
            'description': 'RedGifs user lamsinka89, ordered by recent',
        },
        'playlist_mincount': 100,
    }, {
        'url': 'https://www.redgifs.com/users/lamsinka89?page=3',
        'info_dict': {
            'id': 'lamsinka89?page=3',
            'title': 'lamsinka89',
            'description': 'RedGifs user lamsinka89, ordered by recent',
        },
        'playlist_count': 30,
    }, {
        'url': 'https://www.redgifs.com/users/lamsinka89?order=best&type=g',
        'info_dict': {
            'id': 'lamsinka89?order=best&type=g',
            'title': 'lamsinka89',
            'description': 'RedGifs user lamsinka89, ordered by best',
        },
        'playlist_mincount': 100,
    }]

    def _real_extract(self, url):
        """Return a playlist of the uploads of the user named in ``url``."""
        match = self._match_valid_url(url)
        username = match.group('username')
        query_str = match.group('query')

        # The playlist id keeps the query string (if any) so that differently
        # ordered or paged views of the same user get distinct ids.
        if query_str:
            playlist_id = f'{username}?{query_str}'
        else:
            playlist_id = username

        params = compat_parse_qs(query_str)
        order = params.get('order', ('recent',))[0]

        # Accepted API parameters and their defaults.
        api_fields = {
            'order': 'recent',
            'type': None,
        }
        entries = self._paged_entries(
            f'users/{username}/search', playlist_id, params, api_fields)

        description = f'RedGifs user {username}, ordered by {order}'
        return self.playlist_result(entries, playlist_id, username, description)