]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/tiktok.py
[TikTok] Iterate through app versions (#2449)
[yt-dlp.git] / yt_dlp / extractor / tiktok.py
CommitLineData
1ead840d
KS
1# coding: utf-8
2from __future__ import unicode_literals
f7f18f90
A
3
4import itertools
bd9ff55b
M
5import random
6import string
7import time
0fd6661e 8import json
1ead840d
KS
9
10from .common import InfoExtractor
be1f331f
M
11from ..compat import (
12 compat_urllib_parse_unquote,
13 compat_urllib_parse_urlparse
14)
1ead840d 15from ..utils import (
ce18a19b 16 ExtractorError,
1ead840d 17 int_or_none,
34921b43 18 join_nonempty,
b3187433 19 LazyList,
e0585e65 20 srt_subtitles_timecode,
1ead840d 21 str_or_none,
bd9ff55b
M
22 traverse_obj,
23 try_get,
943d5ab1 24 url_or_none,
bd9ff55b 25 qualities,
1ead840d
KS
26)
27
28
class TikTokBaseIE(InfoExtractor):
    # Shared helpers for the TikTok-family extractors in this module: calling
    # the mobile-app API while cycling through known app versions, and turning
    # "aweme" (video) objects from the app API or the web page into info dicts.

    # (version_name, manifest_version_code) pairs tried in order by _call_api
    _APP_VERSIONS = [('20.9.3', '293'), ('20.4.3', '243'), ('20.2.1', '221'), ('20.1.2', '212'), ('20.0.4', '204')]
    # Set by _call_api to the first pair that worked (or to the pair passed as extractor arguments)
    _WORKING_APP_VERSION = None
    _APP_NAME = 'trill'
    _AID = 1180
    _API_HOSTNAME = 'api-h2.tiktokv.com'
    _UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s'
    _WEBPAGE_HOST = 'https://www.tiktok.com/'
    # Ascending order; used to build the format 'quality' rank
    QUALITIES = ('360p', '540p', '720p', '1080p')

    def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True,
                       note='Downloading API JSON', errnote='Unable to download API page'):
        """Perform a single request against https://<_API_HOSTNAME>/aweme/v1/<ep>/.

        A fresh random 160-hex-digit 'odin_tt' cookie is set for the API host on
        every call, and the 'sid_tt' cookie of the web host (if any) is copied
        over to the API host. Returns whatever `_download_json` returns.
        """
        self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for _ in range(160)))
        webpage_cookies = self._get_cookies(self._WEBPAGE_HOST)
        if webpage_cookies.get('sid_tt'):
            self._set_cookie(self._API_HOSTNAME, 'sid_tt', webpage_cookies['sid_tt'].value)
        return self._download_json(
            'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id,
            fatal=fatal, note=note, errnote=errnote, headers={
                'User-Agent': f'com.ss.android.ugc.trill/{manifest_app_version} (Linux; U; Android 10; en_US; Pixel 4; Build/QQ3A.200805.001; Cronet/58.0.2991.0)',
                'Accept': 'application/json',
            }, query=query)

    def _build_api_query(self, query, app_version, manifest_app_version):
        """Return `query` extended with the app/device parameters sent on every API call.

        Keys listed here override same-named keys of `query`. `openudid`, `uuid`,
        `_rticket` and `ts` are regenerated on each call.
        """
        return {
            **query,
            'version_name': app_version,
            'version_code': manifest_app_version,
            'build_number': app_version,
            'manifest_version_code': manifest_app_version,
            'update_version_code': manifest_app_version,
            'openudid': ''.join(random.choice('0123456789abcdef') for _ in range(16)),
            'uuid': ''.join([random.choice(string.digits) for _ in range(16)]),
            '_rticket': int(time.time() * 1000),
            'ts': int(time.time()),
            'device_brand': 'Google',
            'device_type': 'Pixel 4',
            'device_platform': 'android',
            'resolution': '1080*1920',
            'dpi': 420,
            'os_version': '10',
            'os_api': '29',
            'carrier_region': 'US',
            'sys_region': 'US',
            'region': 'US',
            'app_name': self._APP_NAME,
            'app_language': 'en',
            'language': 'en',
            'timezone_name': 'America/New_York',
            'timezone_offset': '-14400',
            'channel': 'googleplay',
            'ac': 'wifi',
            'mcc_mnc': '310260',
            'is_my_cn': 0,
            'aid': self._AID,
            'ssmix': 'a',
            'as': 'a1qwert123',
            'cp': 'cbfhckdckkde1',
        }

    def _call_api(self, ep, query, video_id, fatal=True,
                  note='Downloading API JSON', errnote='Unable to download API page'):
        """Call API endpoint `ep`, trying app versions until one yields parseable JSON.

        If both `app_version` and `manifest_app_version` extractor arguments are
        given, that pair is used exclusively. Otherwise each entry of
        `_APP_VERSIONS` is tried in order; a response whose JSON parsing fails at
        position 0 moves on to the next entry. The first pair that does not raise
        is remembered in `_WORKING_APP_VERSION` and reused by later calls.

        Raises ExtractorError for any other failure, or when the last entry also
        fails and `fatal` is true. Returns None when the last entry fails and
        `fatal` is false.
        """
        if not self._WORKING_APP_VERSION:
            app_version = self._configuration_arg('app_version', [''], ie_key=TikTokIE.ie_key())[0]
            manifest_app_version = self._configuration_arg('manifest_app_version', [''], ie_key=TikTokIE.ie_key())[0]
            if app_version and manifest_app_version:
                self._WORKING_APP_VERSION = (app_version, manifest_app_version)
                self.write_debug('Imported app version combo from extractor arguments')
            elif app_version or manifest_app_version:
                self.report_warning('Only one of the two required version params are passed as extractor arguments', only_once=True)

        if self._WORKING_APP_VERSION:
            app_version, manifest_app_version = self._WORKING_APP_VERSION
            real_query = self._build_api_query(query, app_version, manifest_app_version)
            return self._call_api_impl(ep, real_query, manifest_app_version, video_id, fatal, note, errnote)

        for count, (app_version, manifest_app_version) in enumerate(self._APP_VERSIONS, start=1):
            real_query = self._build_api_query(query, app_version, manifest_app_version)
            try:
                res = self._call_api_impl(ep, real_query, manifest_app_version, video_id, fatal, note, errnote)
                self._WORKING_APP_VERSION = (app_version, manifest_app_version)
                return res
            except ExtractorError as e:
                # Only a JSON decode failure at the very first character triggers a retry with the next version
                if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0:
                    if count == len(self._APP_VERSIONS):
                        if fatal:
                            raise
                        else:
                            self.report_warning(str(e.cause or e.msg))
                            return
                    self.report_warning('%s. Retrying... (attempt %s of %s)' % (str(e.cause or e.msg), count, len(self._APP_VERSIONS)))
                    continue
                raise

    def _get_subtitles(self, aweme_detail, aweme_id):
        """Download the auto-captions referenced by `aweme_detail` and convert them to SRT.

        Returns a dict mapping language code (default 'en') to a list of
        subtitle dicts with 'ext' and 'data'. Captions that cannot be
        downloaded or contain no usable utterances are skipped.
        """
        # TODO: Extract text positioning info
        subtitles = {}
        captions_info = traverse_obj(
            aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 'auto_captions', ...), expected_type=dict, default=[])
        for caption in captions_info:
            caption_url = traverse_obj(caption, ('url', 'url_list', ...), expected_type=url_or_none, get_all=False)
            if not caption_url:
                continue
            caption_json = self._download_json(
                caption_url, aweme_id, note='Downloading captions', errnote='Unable to download captions', fatal=False)
            if not caption_json:
                continue
            # A captions file without 'utterances' must not abort the whole extraction
            cues = [
                f'{i + 1}\n{srt_subtitles_timecode(line["start_time"] / 1000)} --> {srt_subtitles_timecode(line["end_time"] / 1000)}\n{line["text"]}'
                for i, line in enumerate(caption_json.get('utterances') or []) if line.get('text')]
            if not cues:
                continue
            subtitles.setdefault(caption.get('language', 'en'), []).append({
                'ext': 'srt',
                'data': '\n\n'.join(cues),
            })
        return subtitles

    def _parse_aweme_video_app(self, aweme_detail):
        """Build an info dict from an aweme object as returned by the app API.

        `aweme_detail` must contain 'aweme_id', 'video' and 'desc'; everything
        else is optional.
        """
        aweme_id = aweme_detail['aweme_id']
        video_info = aweme_detail['video']

        def parse_url_key(url_key):
            # Split a url_key of the form "v<...>_<codec>_<res>p_<bitrate>" into format fields
            format_id, codec, res, bitrate = self._search_regex(
                r'v[^_]+_(?P<id>(?P<codec>[^_]+)_(?P<res>\d+p)_(?P<bitrate>\d+))', url_key,
                'url key', default=(None, None, None, None), group=('id', 'codec', 'res', 'bitrate'))
            if not format_id:
                return {}, None
            return {
                'format_id': format_id,
                'vcodec': 'h265' if codec == 'bytevc1' else codec,
                'tbr': int_or_none(bitrate, scale=1000) or None,
                'quality': qualities(self.QUALITIES)(res),
            }, res

        # Width/height seen so far for each resolution label (e.g. '720p')
        known_resolutions = {}

        def extract_addr(addr, add_meta=None):
            # Work on a private copy: the dict is modified below and must never be
            # shared between calls (the previous `add_meta={}` default was)
            add_meta = dict(add_meta or {})
            parsed_meta, res = parse_url_key(addr.get('url_key', ''))
            if res:
                known_resolutions.setdefault(res, {}).setdefault('height', add_meta.get('height'))
                known_resolutions[res].setdefault('width', add_meta.get('width'))
                parsed_meta.update(known_resolutions.get(res, {}))
                add_meta.setdefault('height', int_or_none(res[:-1]))
            return [{
                'url': url,
                'filesize': int_or_none(addr.get('data_size')),
                'ext': 'mp4',
                'acodec': 'aac',
                'source_preference': -2 if 'aweme/v1' in url else -1,  # Downloads from API might get blocked
                **add_meta, **parsed_meta,
                'format_note': join_nonempty(
                    add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' ')
            } for url in addr.get('url_list') or []]

        # Hack: Add direct video links first to prioritize them when removing duplicate formats
        formats = []
        if video_info.get('play_addr'):
            formats.extend(extract_addr(video_info['play_addr'], {
                'format_id': 'play_addr',
                'format_note': 'Direct video',
                'vcodec': 'h265' if traverse_obj(
                    video_info, 'is_bytevc1', 'is_h265') else 'h264',  # TODO: Check for "direct iOS" videos, like https://www.tiktok.com/@cookierun_dev/video/7039716639834656002
                'width': video_info.get('width'),
                'height': video_info.get('height'),
            }))
        if video_info.get('download_addr'):
            formats.extend(extract_addr(video_info['download_addr'], {
                'format_id': 'download_addr',
                'format_note': 'Download video%s' % (', watermarked' if video_info.get('has_watermark') else ''),
                'vcodec': 'h264',
                'width': video_info.get('width'),
                'height': video_info.get('height'),
                'preference': -2 if video_info.get('has_watermark') else -1,
            }))
        if video_info.get('play_addr_h264'):
            formats.extend(extract_addr(video_info['play_addr_h264'], {
                'format_id': 'play_addr_h264',
                'format_note': 'Direct video',
                'vcodec': 'h264',
            }))
        if video_info.get('play_addr_bytevc1'):
            formats.extend(extract_addr(video_info['play_addr_bytevc1'], {
                'format_id': 'play_addr_bytevc1',
                'format_note': 'Direct video',
                'vcodec': 'h265',
            }))

        # `or []` also covers a 'bit_rate' key that is present but null
        for bitrate in video_info.get('bit_rate') or []:
            if bitrate.get('play_addr'):
                formats.extend(extract_addr(bitrate['play_addr'], {
                    'format_id': bitrate.get('gear_name'),
                    'format_note': 'Playback video',
                    'tbr': try_get(bitrate, lambda x: x['bit_rate'] / 1000),
                    'vcodec': 'h265' if traverse_obj(
                        bitrate, 'is_bytevc1', 'is_h265') else 'h264',
                    'fps': bitrate.get('FPS'),
                }))

        self._remove_duplicate_formats(formats)
        # Copy the web session cookie to every host that serves a format
        auth_cookie = self._get_cookies(self._WEBPAGE_HOST).get('sid_tt')
        if auth_cookie:
            for f in formats:
                self._set_cookie(compat_urllib_parse_urlparse(f['url']).hostname, 'sid_tt', auth_cookie.value)
        self._sort_formats(formats, ('quality', 'codec', 'size', 'br'))

        thumbnails = []
        for cover_id in ('cover', 'ai_dynamic_cover', 'animated_cover', 'ai_dynamic_cover_bak',
                         'origin_cover', 'dynamic_cover'):
            cover = video_info.get(cover_id)
            if cover:
                for cover_url in cover.get('url_list') or []:
                    thumbnails.append({
                        'id': cover_id,
                        'url': cover_url,
                    })

        # `or {}` (as in _parse_aweme_video_web) also covers keys that are present but null
        stats_info = aweme_detail.get('statistics') or {}
        author_info = aweme_detail.get('author') or {}
        music_info = aweme_detail.get('music') or {}
        user_id = traverse_obj(author_info,
                               'sec_uid', 'id', 'uid', 'unique_id',
                               expected_type=str_or_none, get_all=False)
        # Do not emit a bogus ".../@None" URL when no identifier is available
        user_url = self._UPLOADER_URL_FORMAT % user_id if user_id else None
        labels = traverse_obj(aweme_detail, ('hybrid_label', ..., 'text'), expected_type=str, default=[])

        contained_music_track = traverse_obj(
            music_info, ('matched_song', 'title'), ('matched_pgc_sound', 'title'), expected_type=str)
        contained_music_author = traverse_obj(
            music_info, ('matched_song', 'author'), ('matched_pgc_sound', 'author'), 'author', expected_type=str)

        # For the generic "original sound - <handle>" title, prefer the matched song's metadata
        is_generic_og_trackname = music_info.get('is_original_sound') and music_info.get('title') == 'original sound - %s' % music_info.get('owner_handle')
        if is_generic_og_trackname:
            music_track, music_author = contained_music_track or 'original sound', contained_music_author
        else:
            music_track, music_author = music_info.get('title'), music_info.get('author')

        return {
            'id': aweme_id,
            'title': aweme_detail['desc'],
            'description': aweme_detail['desc'],
            'view_count': int_or_none(stats_info.get('play_count')),
            'like_count': int_or_none(stats_info.get('digg_count')),
            'repost_count': int_or_none(stats_info.get('share_count')),
            'comment_count': int_or_none(stats_info.get('comment_count')),
            'uploader': str_or_none(author_info.get('unique_id')),
            'creator': str_or_none(author_info.get('nickname')),
            'uploader_id': str_or_none(author_info.get('uid')),
            'uploader_url': user_url,
            'track': music_track,
            'album': str_or_none(music_info.get('album')) or None,
            'artist': music_author,
            'timestamp': int_or_none(aweme_detail.get('create_time')),
            'formats': formats,
            'subtitles': self.extract_subtitles(aweme_detail, aweme_id),
            'thumbnails': thumbnails,
            'duration': int_or_none(traverse_obj(video_info, 'duration', ('download_addr', 'duration')), scale=1000),
            'availability': self._availability(
                is_private='Private' in labels,
                needs_subscription='Friends only' in labels,
                is_unlisted='Followers only' in labels)
        }

    def _parse_aweme_video_web(self, aweme_detail, webpage_url):
        """Build an info dict from an aweme object embedded in a web page.

        `aweme_detail` must contain 'video'. `webpage_url` is sent as the
        Referer header when downloading the formats.
        """
        video_info = aweme_detail['video']
        author_info = traverse_obj(aweme_detail, 'authorInfo', 'author', expected_type=dict, default={})
        music_info = aweme_detail.get('music') or {}
        stats_info = aweme_detail.get('stats') or {}
        user_id = (traverse_obj(author_info,
                                'secUid', 'id', 'uid', 'uniqueId',
                                expected_type=str_or_none, get_all=False)
                   or aweme_detail.get('authorSecId'))
        # Do not emit a bogus ".../@None" URL when no identifier is available
        user_url = self._UPLOADER_URL_FORMAT % user_id if user_id else None

        formats = []
        play_url = video_info.get('playAddr')
        width = video_info.get('width')
        height = video_info.get('height')
        # 'playAddr' is either a single URL string or a list of objects carrying 'src'
        if isinstance(play_url, str):
            formats = [{
                'url': self._proto_relative_url(play_url),
                'ext': 'mp4',
                'width': width,
                'height': height,
            }]
        elif isinstance(play_url, list):
            formats = [{
                'url': self._proto_relative_url(url),
                'ext': 'mp4',
                'width': width,
                'height': height,
            } for url in traverse_obj(play_url, (..., 'src'), expected_type=url_or_none, default=[]) if url]

        download_url = url_or_none(video_info.get('downloadAddr')) or traverse_obj(video_info, ('download', 'url'), expected_type=url_or_none)
        if download_url:
            formats.append({
                'format_id': 'download',
                'url': self._proto_relative_url(download_url),
                'ext': 'mp4',
                'width': width,
                'height': height,
            })
        self._remove_duplicate_formats(formats)
        self._sort_formats(formats)

        # NOTE(review): each match replaces the list, so only the last available
        # cover in this tuple is kept - confirm that this is intended
        thumbnails = []
        for thumbnail_name in ('thumbnail', 'cover', 'dynamicCover', 'originCover'):
            if aweme_detail.get(thumbnail_name):
                thumbnails = [{
                    'url': self._proto_relative_url(aweme_detail[thumbnail_name]),
                    'width': width,
                    'height': height
                }]

        return {
            'id': traverse_obj(aweme_detail, 'id', 'awemeId', expected_type=str_or_none),
            'title': aweme_detail.get('desc'),
            'duration': try_get(aweme_detail, lambda x: x['video']['duration'], int),
            'view_count': int_or_none(stats_info.get('playCount')),
            'like_count': int_or_none(stats_info.get('diggCount')),
            'repost_count': int_or_none(stats_info.get('shareCount')),
            'comment_count': int_or_none(stats_info.get('commentCount')),
            'timestamp': int_or_none(aweme_detail.get('createTime')),
            'creator': str_or_none(author_info.get('nickname')),
            'uploader': str_or_none(author_info.get('uniqueId') or aweme_detail.get('author')),
            'uploader_id': str_or_none(author_info.get('id') or aweme_detail.get('authorId')),
            'uploader_url': user_url,
            'track': str_or_none(music_info.get('title')),
            'album': str_or_none(music_info.get('album')) or None,
            'artist': str_or_none(music_info.get('authorName')),
            'formats': formats,
            'thumbnails': thumbnails,
            'description': str_or_none(aweme_detail.get('desc')),
            'http_headers': {
                'Referer': webpage_url
            }
        }
361
0fd6661e
M
362
class TikTokIE(TikTokBaseIE):
    # Extractor for single TikTok video pages. The app API is tried first
    # and the web page is used as a fallback.
    _VALID_URL = r'https?://www\.tiktok\.com/@[\w\.-]+/video/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610',
        'md5': '736bb7a466c6f0a6afeb597da1e6f5b7',
        'info_dict': {
            'id': '6748451240264420610',
            'ext': 'mp4',
            'title': '#jassmanak #lehanga #leenabhushan',
            'description': '#jassmanak #lehanga #leenabhushan',
            'duration': 13,
            'height': 1024,
            'width': 576,
            'uploader': 'leenabhushan',
            'uploader_id': '6691488002098119685',
            'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA_Eb4t1vodM1IuTy_cvp9CY22RAb59xqrO0Xtz9CYQJvgXaDvZxYnZYRzDWhhgJmy',
            'creator': 'facestoriesbyleenabh',
            'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
            'upload_date': '20191016',
            'timestamp': 1571246252,
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
        }
    }, {
        'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en',
        'md5': '6f3cf8cdd9b28cb8363fe0a9a160695b',
        'info_dict': {
            'id': '6742501081818877190',
            'ext': 'mp4',
            'title': 'md5:5e2a23877420bb85ce6521dbee39ba94',
            'description': 'md5:5e2a23877420bb85ce6521dbee39ba94',
            'duration': 27,
            'height': 960,
            'width': 540,
            'uploader': 'patrox',
            'uploader_id': '18702747',
            'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws',
            'creator': 'patroX',
            'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
            'upload_date': '20190930',
            'timestamp': 1569860870,
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
        }
    }, {
        # Banned audio, only available on the app
        'url': 'https://www.tiktok.com/@barudakhb_/video/6984138651336838402',
        'info_dict': {
            'id': '6984138651336838402',
            'ext': 'mp4',
            'title': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥',
            'description': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥',
            'uploader': 'barudakhb_',
            'creator': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6',
            'uploader_id': '6974687867511718913',
            'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d',
            'track': 'Boka Dance',
            'artist': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6',
            'timestamp': 1626121503,
            'duration': 18,
            'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
            'upload_date': '20210712',
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
        }
    }, {
        # Sponsored video, only available with feed workaround
        'url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_/video/7042692929109986561',
        'info_dict': {
            'id': '7042692929109986561',
            'ext': 'mp4',
            'title': 'Slap and Run!',
            'description': 'Slap and Run!',
            'uploader': 'user440922249',
            'creator': 'Slap And Run',
            'uploader_id': '7036055384943690754',
            'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_',
            'track': 'Promoted Music',
            'timestamp': 1639754738,
            'duration': 30,
            'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
            'upload_date': '20211217',
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
        },
        'expected_warnings': ['Video not available']
    }, {
        # Auto-captions available
        'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758',
        'only_matching': True
    }]

    def _extract_aweme_app(self, aweme_id):
        """Fetch and parse a video through the app API.

        The 'aweme/detail' endpoint is queried first. If it raises or returns
        no 'aweme_detail', the 'feed' endpoint is searched for an entry with a
        matching 'aweme_id'. Raises ExtractorError if neither yields the video.
        """
        try:
            aweme_detail = self._call_api('aweme/detail', {'aweme_id': aweme_id}, aweme_id,
                                          note='Downloading video details', errnote='Unable to download video details').get('aweme_detail')
            if not aweme_detail:
                raise ExtractorError('Video not available', video_id=aweme_id)
        except ExtractorError as e:
            self.report_warning(f'{e}; Retrying with feed workaround')
            feed_list = self._call_api('feed', {'aweme_id': aweme_id}, aweme_id,
                                       note='Downloading video feed', errnote='Unable to download video feed').get('aweme_list') or []
            aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None)
            if not aweme_detail:
                raise ExtractorError('Unable to find video in feed', video_id=aweme_id)
        return self._parse_aweme_video_app(aweme_detail)

    def _real_extract(self, url):
        """Extract a single video, via the app API or, failing that, the web page.

        Raises ExtractorError when the page reports the video as private
        (status 10216), reports any other non-zero status, or does not embed
        the video data at all.
        """
        video_id = self._match_id(url)

        try:
            return self._extract_aweme_app(video_id)
        except ExtractorError as e:
            self.report_warning(f'{e}; Retrying with webpage')

        # If we only call once, we get a 403 when downloading the video.
        self._download_webpage(url, video_id)
        webpage = self._download_webpage(url, video_id, note='Downloading video webpage')
        next_data = self._search_nextjs_data(webpage, video_id, default='{}')

        if next_data:
            status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode'), expected_type=int) or 0
            video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct'), expected_type=dict)
        else:
            sigi_json = self._search_regex(
                r'>\s*window\[[\'"]SIGI_STATE[\'"]\]\s*=\s*(?P<sigi_state>{.+});',
                webpage, 'sigi data', group='sigi_state')
            sigi_data = self._parse_json(sigi_json, video_id)
            status = traverse_obj(sigi_data, ('VideoPage', 'statusCode'), expected_type=int) or 0
            video_data = traverse_obj(sigi_data, ('ItemModule', video_id), expected_type=dict)

        # `status` defaults to 0 when the page carries no status code, so also
        # require the video data itself; otherwise the parser would fail with a
        # TypeError on None instead of the ExtractorError below
        if status == 0 and video_data:
            return self._parse_aweme_video_web(video_data, url)
        elif status == 10216:
            raise ExtractorError('This video is private', expected=True)
        raise ExtractorError('Video not available', video_id=video_id)
f7f18f90
A
508
509
class TikTokUserIE(TikTokBaseIE):
    # Extractor for user profile pages; lists the user's videos through the
    # app API 'aweme/post' endpoint.
    IE_NAME = 'tiktok:user'
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/@(?P<id>[\w\.-]+)/?(?:$|[#?])'
    _TESTS = [{
        'url': 'https://tiktok.com/@corgibobaa?lang=en',
        'playlist_mincount': 45,
        'info_dict': {
            'id': '6935371178089399301',
            'title': 'corgibobaa',
            'thumbnail': r're:https://.+_1080x1080\.webp'
        },
        'expected_warnings': ['Retrying']
    }, {
        'url': 'https://www.tiktok.com/@meme',
        'playlist_mincount': 593,
        'info_dict': {
            'id': '79005827461758976',
            'title': 'meme',
            'thumbnail': r're:https://.+_1080x1080\.webp'
        },
        'expected_warnings': ['Retrying']
    }]

    # The web-API based implementation below is disabled (kept as a string literal)
    r'''  # TODO: Fix by adding _signature to api_url
    def _entries(self, webpage, user_id, username):
        secuid = self._search_regex(r'\"secUid\":\"(?P<secUid>[^\"]+)', webpage, username)
        verifyfp_cookie = self._get_cookies('https://www.tiktok.com').get('s_v_web_id')
        if not verifyfp_cookie:
            raise ExtractorError('Improper cookies (missing s_v_web_id).', expected=True)
        api_url = f'https://m.tiktok.com/api/post/item_list/?aid=1988&cookie_enabled=true&count=30&verifyFp={verifyfp_cookie.value}&secUid={secuid}&cursor='
        cursor = '0'
        for page in itertools.count():
            data_json = self._download_json(api_url + cursor, username, note='Downloading Page %d' % page)
            for video in data_json.get('itemList', []):
                video_id = video['id']
                video_url = f'https://www.tiktok.com/@{user_id}/video/{video_id}'
                yield self._url_result(video_url, 'TikTok', video_id, str_or_none(video.get('desc')))
            if not data_json.get('hasMore'):
                break
            cursor = data_json['cursor']
    '''

    def _video_entries_api(self, webpage, user_id, username):
        """Yield the raw aweme objects of a user's posts, page by page.

        Each page request is retried while it fails with a JSON decode error at
        position 0, up to the `extractor_retries` parameter (default 3); any
        other failure is re-raised. Paging stops when the response has no
        truthy 'has_more'. `webpage` is accepted but not used here.
        """
        query = {
            'user_id': user_id,
            'count': 21,
            'max_cursor': 0,
            'min_cursor': 0,
            'retry_type': 'no_retry',
            'device_id': ''.join(random.choice(string.digits) for _ in range(19)),  # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api.
        }

        max_retries = self.get_param('extractor_retries', 3)
        for page in itertools.count(1):
            for retries in itertools.count():
                try:
                    post_list = self._call_api('aweme/post', query, username,
                                               note='Downloading user video list page %d%s' % (page, f' (attempt {retries})' if retries != 0 else ''),
                                               errnote='Unable to download user video list')
                except ExtractorError as e:
                    if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0 and retries != max_retries:
                        self.report_warning('%s. Retrying...' % str(e.cause or e.msg))
                        continue
                    raise
                break
            # `or []` also covers an 'aweme_list' key that is present but null
            yield from post_list.get('aweme_list') or []
            if not post_list.get('has_more'):
                break
            query['max_cursor'] = post_list['max_cursor']

    def _entries_api(self, user_id, videos):
        """Yield parsed info dicts for `videos`, attributed to the TikTok video extractor."""
        for video in videos:
            yield {
                **self._parse_aweme_video_app(video),
                'extractor_key': TikTokIE.ie_key(),
                'extractor': 'TikTok',
                'webpage_url': f'https://tiktok.com/@{user_id}/video/{video["aweme_id"]}',
            }

    def _real_extract(self, url):
        """Return a playlist of the user's videos.

        The numeric user ID is read from a `snssdk...://user/profile/<id>` link
        in the profile page, which is requested with a Facebook crawler
        User-Agent.
        """
        user_name = self._match_id(url)
        webpage = self._download_webpage(url, user_name, headers={
            'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)'
        })
        user_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID')

        # LazyList lets the first entry be inspected for the avatar without
        # consuming the generator that feeds the playlist entries
        videos = LazyList(self._video_entries_api(webpage, user_id, user_name))
        thumbnail = traverse_obj(videos, (0, 'author', 'avatar_larger', 'url_list', 0))

        return self.playlist_result(self._entries_api(user_id, videos), user_id, user_name, thumbnail=thumbnail)
943d5ab1
M
600
601
8126298c
M
class TikTokBaseListIE(TikTokBaseIE):
    # Base for list-style extractors. Subclasses provide `_QUERY_NAME` (the
    # query key that carries the list ID) and `_API_ENDPOINT`.

    def _entries(self, list_id, display_id):
        """Yield parsed info dicts for every video of the list, page by page.

        Each page request is retried while it fails with a JSON decode error at
        position 0, up to the `extractor_retries` parameter (default 3); any
        other failure is re-raised. Paging stops when the response has no
        truthy 'has_more'.
        """
        query = {
            self._QUERY_NAME: list_id,
            'cursor': 0,
            'count': 20,
            'type': 5,
            'device_id': ''.join(random.choice(string.digits) for _ in range(19))
        }

        max_retries = self.get_param('extractor_retries', 3)
        for page in itertools.count(1):
            for retries in itertools.count():
                try:
                    post_list = self._call_api(self._API_ENDPOINT, query, display_id,
                                               note='Downloading video list page %d%s' % (page, f' (attempt {retries})' if retries != 0 else ''),
                                               errnote='Unable to download video list')
                except ExtractorError as e:
                    if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0 and retries != max_retries:
                        self.report_warning('%s. Retrying...' % str(e.cause or e.msg))
                        continue
                    raise
                break
            # `or []` also covers an 'aweme_list' key that is present but null
            for video in post_list.get('aweme_list') or []:
                yield {
                    **self._parse_aweme_video_app(video),
                    'extractor_key': TikTokIE.ie_key(),
                    'extractor': 'TikTok',
                    'webpage_url': f'https://tiktok.com/@_/video/{video["aweme_id"]}',
                }
            if not post_list.get('has_more'):
                break
            query['cursor'] = post_list['cursor']

    def _real_extract(self, url):
        """Return a playlist whose ID is the list ID matched from `url`."""
        list_id = self._match_id(url)
        return self.playlist_result(self._entries(list_id, list_id), list_id)
639
640
class TikTokSoundIE(TikTokBaseListIE):
    # Lists the videos of a /music/ page via the 'music/aweme' endpoint,
    # passing the numeric ID from the URL as 'music_id'.
    IE_NAME = 'tiktok:sound'
    _API_ENDPOINT = 'music/aweme'
    _QUERY_NAME = 'music_id'
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/music/[\w\.-]+-(?P<id>[\d]+)[/?#&]?'
    _TESTS = [
        {
            'url': 'https://www.tiktok.com/music/Build-a-Btch-6956990112127585029?lang=en',
            'playlist_mincount': 100,
            'info_dict': {'id': '6956990112127585029'},
            'expected_warnings': ['Retrying'],
        },
        {
            # Actual entries are less than listed video count
            'url': 'https://www.tiktok.com/music/jiefei-soap-remix-7036843036118469381',
            'playlist_mincount': 2182,
            'info_dict': {'id': '7036843036118469381'},
            'expected_warnings': ['Retrying'],
        },
    ]
662
663
class TikTokEffectIE(TikTokBaseListIE):
    # Lists the videos of a /sticker/ page via the 'sticker/aweme' endpoint,
    # passing the numeric ID from the URL as 'sticker_id'.
    IE_NAME = 'tiktok:effect'
    _API_ENDPOINT = 'sticker/aweme'
    _QUERY_NAME = 'sticker_id'
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/sticker/[\w\.-]+-(?P<id>[\d]+)[/?#&]?'
    _TESTS = [
        {
            'url': 'https://www.tiktok.com/sticker/MATERIAL-GWOOORL-1258156',
            'playlist_mincount': 100,
            'info_dict': {'id': '1258156'},
            'expected_warnings': ['Retrying'],
        },
        {
            # Different entries between mobile and web, depending on region
            'url': 'https://www.tiktok.com/sticker/Elf-Friend-479565',
            'only_matching': True,
        },
    ]
681
682
class TikTokTagIE(TikTokBaseListIE):
    # Lists the videos of a /tag/ page via the 'challenge/aweme' endpoint. The
    # numeric challenge ID is looked up on the tag page and passed as 'ch_id'.
    IE_NAME = 'tiktok:tag'
    _API_ENDPOINT = 'challenge/aweme'
    _QUERY_NAME = 'ch_id'
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/tag/(?P<id>[^/?#&]+)'
    _TESTS = [
        {
            'url': 'https://tiktok.com/tag/hello2018',
            'playlist_mincount': 39,
            'info_dict': {
                'id': '46294678',
                'title': 'hello2018',
            },
            'expected_warnings': ['Retrying'],
        },
        {
            'url': 'https://tiktok.com/tag/fypシ?is_copy_url=0&is_from_webapp=v1',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Resolve the tag name from `url` to its numeric ID and return the tag's playlist."""
        tag_name = self._match_id(url)
        # The page is requested with a Facebook crawler User-Agent
        crawler_headers = {
            'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)'
        }
        page = self._download_webpage(url, tag_name, headers=crawler_headers)
        tag_id = self._html_search_regex(r'snssdk\d*://challenge/detail/(\d+)', page, 'tag ID')
        entries = self._entries(tag_id, tag_name)
        return self.playlist_result(entries, tag_id, tag_name)
708
709
943d5ab1
M
class DouyinIE(TikTokIE):
    # Extractor for douyin.com video pages. Reuses the TikTok app-API flow with
    # Douyin-specific constants and falls back to the page's RENDER_DATA.
    _VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'https://www.douyin.com/video/6961737553342991651',
        'md5': '10523312c8b8100f353620ac9dc8f067',
        'info_dict': {
            'id': '6961737553342991651',
            'ext': 'mp4',
            'title': '#杨超越 小小水手带你去远航❤️',
            'uploader': '杨超越',
            'upload_date': '20210513',
            'timestamp': 1620905839,
            'uploader_id': '110403406559',
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
        }
    }, {
        'url': 'https://www.douyin.com/video/6982497745948921092',
        'md5': 'd78408c984b9b5102904cf6b6bc2d712',
        'info_dict': {
            'id': '6982497745948921092',
            'ext': 'mp4',
            'title': '这个夏日和小羊@杨超越 一起遇见白色幻想',
            'uploader': '杨超越工作室',
            'upload_date': '20210708',
            'timestamp': 1625739481,
            'uploader_id': '408654318141572',
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
        }
    }, {
        'url': 'https://www.douyin.com/video/6953975910773099811',
        'md5': '72e882e24f75064c218b76c8b713c185',
        'info_dict': {
            'id': '6953975910773099811',
            'ext': 'mp4',
            'title': '#一起看海 出现在你的夏日里',
            'uploader': '杨超越',
            'upload_date': '20210422',
            'timestamp': 1619098692,
            'uploader_id': '110403406559',
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
        }
    }, {
        'url': 'https://www.douyin.com/video/6950251282489675042',
        'md5': 'b4db86aec367ef810ddd38b1737d2fed',
        'info_dict': {
            'id': '6950251282489675042',
            'ext': 'mp4',
            'title': '哈哈哈,成功了哈哈哈哈哈哈',
            'uploader': '杨超越',
            'upload_date': '20210412',
            'timestamp': 1618231483,
            'uploader_id': '110403406559',
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
        }
    }, {
        'url': 'https://www.douyin.com/video/6963263655114722595',
        'md5': '1abe1c477d05ee62efb40bf2329957cf',
        'info_dict': {
            'id': '6963263655114722595',
            'ext': 'mp4',
            'title': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
            'uploader': '杨超越',
            'upload_date': '20210517',
            'timestamp': 1621261163,
            'uploader_id': '110403406559',
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
        }
    }]
    _APP_VERSIONS = [('9.6.0', '960')]
    _APP_NAME = 'aweme'
    _AID = 1128
    _API_HOSTNAME = 'aweme.snssdk.com'
    _UPLOADER_URL_FORMAT = 'https://www.douyin.com/user/%s'
    _WEBPAGE_HOST = 'https://www.douyin.com/'

    def _real_extract(self, url):
        """Extract a single Douyin video, via the app API or, failing that, the web page.

        Raises ExtractorError when the page carries no RENDER_DATA script or
        when that data contains no video details.
        """
        video_id = self._match_id(url)

        try:
            return self._extract_aweme_app(video_id)
        except ExtractorError as e:
            self.report_warning(f'{e}; Retrying with webpage')

        webpage = self._download_webpage(url, video_id)
        # The script body is URL-encoded JSON, hence the %7B ... %7D delimiters
        render_data_json = self._search_regex(
            r'<script [^>]*\bid=[\'"]RENDER_DATA[\'"][^>]*>(%7B.+%7D)</script>',
            webpage, 'render data', default=None)
        if not render_data_json:
            # TODO: Run verification challenge code to generate signature cookies
            raise ExtractorError('Fresh cookies (not necessarily logged in) are needed')

        render_data = self._parse_json(
            render_data_json, video_id, transform_source=compat_urllib_parse_unquote)
        aweme_detail = traverse_obj(render_data, (..., 'aweme', 'detail'), get_all=False)
        if not aweme_detail:
            # Without this check the parser would fail with a TypeError on None
            raise ExtractorError('Video not available', video_id=video_id)
        return self._parse_aweme_video_web(aweme_detail, url)