]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/tiktok.py
[ie/roosterteeth] Extract release date and timestamp (#9393)
[yt-dlp.git] / yt_dlp / extractor / tiktok.py
CommitLineData
f7f18f90 1import itertools
b801cd71 2import json
bd9ff55b 3import random
216bcb66 4import re
bd9ff55b
M
5import string
6import time
1ead840d
KS
7
8from .common import InfoExtractor
9ff94664 9from ..compat import compat_urllib_parse_urlparse
3d2623a8 10from ..networking import HEADRequest
1ead840d 11from ..utils import (
ce18a19b 12 ExtractorError,
b801cd71 13 LazyList,
11e1c2e3 14 UnsupportedError,
933ed882 15 UserNotLive,
8ceb07e8 16 determine_ext,
216bcb66 17 format_field,
1ead840d 18 int_or_none,
34921b43 19 join_nonempty,
216bcb66 20 merge_dicts,
b801cd71 21 qualities,
ba723997 22 remove_start,
e0585e65 23 srt_subtitles_timecode,
1ead840d 24 str_or_none,
bd9ff55b 25 traverse_obj,
216bcb66 26 try_call,
bd9ff55b 27 try_get,
943d5ab1 28 url_or_none,
1ead840d
KS
29)
30
31
0fd6661e 32class TikTokBaseIE(InfoExtractor):
f7c5a5e9 33 _APP_VERSIONS = [('26.1.3', '260103'), ('26.1.2', '260102'), ('26.1.1', '260101'), ('25.6.2', '250602')]
046cab39 34 _WORKING_APP_VERSION = None
943d5ab1
M
35 _APP_NAME = 'trill'
36 _AID = 1180
943d5ab1 37 _UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s'
53dad39e 38 _WEBPAGE_HOST = 'https://www.tiktok.com/'
be1f331f 39 QUALITIES = ('360p', '540p', '720p', '1080p')
ce18a19b 40
@property
def _API_HOSTNAME(self):
    """Hostname used for mobile API requests.

    Returns the first value of the ``api_hostname`` extractor argument
    (looked up under the TikTokIE key), or the built-in default host when
    the argument is not supplied.
    """
    default_hosts = ['api16-normal-c-useast1a.tiktokv.com']
    configured_hosts = self._configuration_arg('api_hostname', default_hosts, ie_key=TikTokIE)
    return configured_hosts[0]
45
b801cd71 46 @staticmethod
47 def _create_url(user_id, video_id):
48 return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}'
49
def _get_sigi_state(self, webpage, display_id):
    """Return the JSON embedded in the SIGI_STATE / sigi-persisted-data
    <script> element of ``webpage``, or an empty dict if it is not found."""
    sigi_script_open = r'<script[^>]+\bid="(?:SIGI_STATE|sigi-persisted-data)"[^>]*>'
    return self._search_json(
        sigi_script_open, webpage, 'sigi state', display_id,
        end_pattern=r'</script>', default={})
54
def _get_universal_data(self, webpage, display_id):
    """Return the ``__DEFAULT_SCOPE__`` dict from the
    __UNIVERSAL_DATA_FOR_REHYDRATION__ <script> element of ``webpage``.

    An empty dict is returned when the script element or the scope is missing.
    """
    rehydration_json = self._search_json(
        r'<script[^>]+\bid="__UNIVERSAL_DATA_FOR_REHYDRATION__"[^>]*>', webpage,
        'universal data', display_id, end_pattern=r'</script>', default={})
    default_scope = traverse_obj(rehydration_json, ('__DEFAULT_SCOPE__', {dict}))
    return default_scope or {}
a39a7ba8 60
046cab39
M
def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True,
                   note='Downloading API JSON', errnote='Unable to download API page'):
    """Perform a single request against the ``aweme/v1/<ep>/`` API endpoint.

    Before the request, a random 160-hex-digit ``odin_tt`` cookie is set for
    the API host, and the ``sid_tt`` cookie of the webpage host (if any) is
    copied to the API host. ``manifest_app_version`` is only used to build the
    User-Agent header. Returns whatever ``_download_json`` returns.
    """
    api_host = self._API_HOSTNAME
    odin_token = ''.join(random.choices('0123456789abcdef', k=160))
    self._set_cookie(api_host, 'odin_tt', odin_token)

    # Re-use the webpage session cookie for the API host when one is available
    webpage_cookies = self._get_cookies(self._WEBPAGE_HOST)
    if webpage_cookies.get('sid_tt'):
        self._set_cookie(api_host, 'sid_tt', webpage_cookies['sid_tt'].value)

    request_headers = {
        'User-Agent': f'com.ss.android.ugc.{self._APP_NAME}/{manifest_app_version} (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)',
        'Accept': 'application/json',
    }
    return self._download_json(
        f'https://{api_host}/aweme/v1/{ep}/', video_id=video_id,
        fatal=fatal, note=note, errnote=errnote, headers=request_headers, query=query)
73
74 def _build_api_query(self, query, app_version, manifest_app_version):
75 return {
0fd6661e 76 **query,
046cab39
M
77 'version_name': app_version,
78 'version_code': manifest_app_version,
79 'build_number': app_version,
80 'manifest_version_code': manifest_app_version,
81 'update_version_code': manifest_app_version,
efa944f4
AM
82 'openudid': ''.join(random.choices('0123456789abcdef', k=16)),
83 'uuid': ''.join(random.choices(string.digits, k=16)),
bd9ff55b
M
84 '_rticket': int(time.time() * 1000),
85 'ts': int(time.time()),
86 'device_brand': 'Google',
c2a1bdb0 87 'device_type': 'Pixel 7',
bd9ff55b 88 'device_platform': 'android',
c2a1bdb0 89 'resolution': '1080*2400',
bd9ff55b 90 'dpi': 420,
c2a1bdb0 91 'os_version': '13',
bd9ff55b
M
92 'os_api': '29',
93 'carrier_region': 'US',
94 'sys_region': 'US',
95 'region': 'US',
943d5ab1 96 'app_name': self._APP_NAME,
bd9ff55b
M
97 'app_language': 'en',
98 'language': 'en',
99 'timezone_name': 'America/New_York',
100 'timezone_offset': '-14400',
101 'channel': 'googleplay',
102 'ac': 'wifi',
103 'mcc_mnc': '310260',
104 'is_my_cn': 0,
943d5ab1 105 'aid': self._AID,
bd9ff55b
M
106 'ssmix': 'a',
107 'as': 'a1qwert123',
108 'cp': 'cbfhckdckkde1',
109 }
046cab39
M
110
def _call_api(self, ep, query, video_id, fatal=True,
              note='Downloading API JSON', errnote='Unable to download API page'):
    """Call API endpoint ``ep``, choosing an app version combo to send.

    The version pair stored in ``_WORKING_APP_VERSION`` is used when set; it
    is seeded from the ``app_version``/``manifest_app_version`` extractor
    arguments when both are given. Otherwise each entry of ``_APP_VERSIONS``
    is tried in order, and the first one that yields a response is remembered.

    A candidate is only skipped when the request fails with a JSON decode
    error at position 0; any other ExtractorError propagates immediately.
    When the last candidate fails that way, the error is raised if ``fatal``,
    otherwise a warning is reported and None is returned.
    """
    if not self._WORKING_APP_VERSION:
        arg_version = self._configuration_arg('app_version', [''], ie_key=TikTokIE.ie_key())[0]
        arg_manifest = self._configuration_arg('manifest_app_version', [''], ie_key=TikTokIE.ie_key())[0]
        if arg_version and arg_manifest:
            self._WORKING_APP_VERSION = (arg_version, arg_manifest)
            self.write_debug('Imported app version combo from extractor arguments')
        elif arg_version or arg_manifest:
            self.report_warning('Only one of the two required version params are passed as extractor arguments', only_once=True)

    # Fast path: a version combo is already known, so no probing is needed
    if self._WORKING_APP_VERSION:
        version_name, version_code = self._WORKING_APP_VERSION
        api_query = self._build_api_query(query, version_name, version_code)
        return self._call_api_impl(ep, api_query, version_code, video_id, fatal, note, errnote)

    total = len(self._APP_VERSIONS)
    for attempt, (version_name, version_code) in enumerate(self._APP_VERSIONS, start=1):
        api_query = self._build_api_query(query, version_name, version_code)
        try:
            response = self._call_api_impl(ep, api_query, version_code, video_id, fatal, note, errnote)
        except ExtractorError as e:
            undecodable = isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0
            if not undecodable:
                raise
            if attempt < total:
                self.report_warning(f'{e.cause or e.msg}. Retrying... (attempt {attempt} of {total})')
                continue
            # Every candidate has been exhausted
            if fatal:
                raise
            self.report_warning(str(e.cause or e.msg))
            return
        self._WORKING_APP_VERSION = (version_name, version_code)
        return response
0fd6661e 144
def _extract_aweme_app(self, aweme_id):
    """Fetch a video through the ``feed`` API endpoint and parse it.

    Raises ExtractorError when the returned ``aweme_list`` contains no entry
    whose ``aweme_id`` equals the requested one.
    """
    feed_response = self._call_api(
        'feed', {'aweme_id': aweme_id}, aweme_id, note='Downloading video feed',
        errnote='Unable to download video feed')

    # The feed may contain other videos; pick the first entry with our ID
    for candidate in feed_response.get('aweme_list') or []:
        if str(candidate.get('aweme_id')) == aweme_id:
            aweme_detail = candidate
            break
    else:
        aweme_detail = None

    if not aweme_detail:
        raise ExtractorError('Unable to find video in feed', video_id=aweme_id)
    return self._parse_aweme_video_app(aweme_detail)
153
e0585e65
M
def _get_subtitles(self, aweme_detail, aweme_id):
    """Collect subtitles for a video from the first source that provides any.

    Sources are tried in order: auto-captions attached to interaction
    stickers (downloaded and converted to SRT), then ``video.cla_info``
    caption URLs, then ``video.subtitleInfos`` caption URLs.

    Returns a dict mapping language code to a list of subtitle dicts.
    Malformed caption JSON is skipped instead of aborting extraction, and a
    caption track without any usable cue is not added, so that the later
    sources still get a chance.
    """
    # TODO: Extract text positioning info
    subtitles = {}
    # aweme/detail endpoint subs
    captions_info = traverse_obj(
        aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 'auto_captions', ...), expected_type=dict)
    for caption in captions_info:
        caption_url = traverse_obj(caption, ('url', 'url_list', ...), expected_type=url_or_none, get_all=False)
        if not caption_url:
            continue
        caption_json = self._download_json(
            caption_url, aweme_id, note='Downloading captions', errnote='Unable to download captions', fatal=False)
        if not caption_json:
            continue

        cues = []
        for index, line in enumerate(traverse_obj(caption_json, ('utterances', ..., {dict})), start=1):
            start_ms, end_ms = line.get('start_time'), line.get('end_time')
            # Subtitles are optional: skip cues lacking text or timing rather than raising KeyError
            if not line.get('text') or start_ms is None or end_ms is None:
                continue
            cues.append(
                f'{index}\n{srt_subtitles_timecode(start_ms / 1000)} --> {srt_subtitles_timecode(end_ms / 1000)}\n{line["text"]}')
        if not cues:
            continue

        # `or` (not a .get default) so that an explicit null language also falls back to 'en'
        subtitles.setdefault(caption.get('language') or 'en', []).append({
            'ext': 'srt',
            'data': '\n\n'.join(cues),
        })
    # feed endpoint subs
    if not subtitles:
        for caption in traverse_obj(aweme_detail, ('video', 'cla_info', 'caption_infos', ...), expected_type=dict):
            if not caption.get('url'):
                continue
            subtitles.setdefault(caption.get('lang') or 'en', []).append({
                'ext': remove_start(caption.get('caption_format'), 'web'),
                'url': caption['url'],
            })
    # webpage subs
    if not subtitles:
        for caption in traverse_obj(aweme_detail, ('video', 'subtitleInfos', ...), expected_type=dict):
            if not caption.get('Url'):
                continue
            subtitles.setdefault(caption.get('LanguageCodeName') or 'en', []).append({
                'ext': remove_start(caption.get('Format'), 'web'),
                'url': caption['Url'],
            })
    return subtitles
193
def _parse_aweme_video_app(self, aweme_detail):
    """Build an info dict from an ``aweme`` object returned by the mobile API.

    ``aweme_detail`` must contain ``aweme_id`` and ``video``; every other
    field is optional. Formats are collected from ``play_addr``,
    ``download_addr``, ``play_addr_h264``, ``play_addr_bytevc1`` and each
    ``bit_rate`` entry, in that order, and duplicates are removed afterwards.
    """
    aweme_id = aweme_detail['aweme_id']
    video_info = aweme_detail['video']

    def parse_url_key(url_key):
        """Split a ``url_key`` into format metadata; returns ``(meta, resolution)``."""
        format_id, codec, res, bitrate = self._search_regex(
            r'v[^_]+_(?P<id>(?P<codec>[^_]+)_(?P<res>\d+p)_(?P<bitrate>\d+))', url_key,
            'url key', default=(None, None, None, None), group=('id', 'codec', 'res', 'bitrate'))
        if not format_id:
            return {}, None
        return {
            'format_id': format_id,
            'vcodec': 'h265' if codec == 'bytevc1' else codec,
            'tbr': int_or_none(bitrate, scale=1000) or None,
            'quality': qualities(self.QUALITIES)(res),
        }, res

    # Dimensions seen per resolution label; the first address to report them wins
    known_resolutions = {}

    def audio_meta(url):
        """Return audio-only format overrides for music-track URLs, else an empty dict."""
        ext = determine_ext(url, default_ext='m4a')
        return {
            'format_note': 'Music track',
            'ext': ext,
            'acodec': 'aac' if ext == 'm4a' else ext,
            'vcodec': 'none',
            'width': None,
            'height': None,
        } if ext == 'mp3' or '-music-' in url else {}

    def extract_addr(addr, add_meta=None):
        """Return one format dict per URL in ``addr['url_list']``."""
        # Work on a private copy: neither a shared default nor the caller's dict may be mutated
        add_meta = dict(add_meta or {})
        # `or ''` also covers a url_key that is present but null
        parsed_meta, res = parse_url_key(addr.get('url_key') or '')
        if res:
            known_resolutions.setdefault(res, {}).setdefault('height', int_or_none(addr.get('height')))
            known_resolutions[res].setdefault('width', int_or_none(addr.get('width')))
            parsed_meta.update(known_resolutions.get(res, {}))
            add_meta.setdefault('height', int_or_none(res[:-1]))
        return [{
            'url': url,
            'filesize': int_or_none(addr.get('data_size')),
            'ext': 'mp4',
            'acodec': 'aac',
            'source_preference': -2 if 'aweme/v1' in url else -1,  # Downloads from API might get blocked
            **add_meta, **parsed_meta,
            'format_note': join_nonempty(
                add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' '),
            **audio_meta(url),
        } for url in addr.get('url_list') or []]

    # Hack: Add direct video links first to prioritize them when removing duplicate formats
    formats = []
    width = int_or_none(video_info.get('width'))
    height = int_or_none(video_info.get('height'))
    if video_info.get('play_addr'):
        formats.extend(extract_addr(video_info['play_addr'], {
            'format_id': 'play_addr',
            'format_note': 'Direct video',
            'vcodec': 'h265' if traverse_obj(
                video_info, 'is_bytevc1', 'is_h265') else 'h264',  # TODO: Check for "direct iOS" videos, like https://www.tiktok.com/@cookierun_dev/video/7039716639834656002
            'width': width,
            'height': height,
        }))
    if video_info.get('download_addr'):
        download_addr = video_info['download_addr']
        dl_width = int_or_none(download_addr.get('width'))
        formats.extend(extract_addr(download_addr, {
            'format_id': 'download_addr',
            'format_note': 'Download video%s' % (', watermarked' if video_info.get('has_watermark') else ''),
            'vcodec': 'h264',
            'width': dl_width or width,
            'height': try_call(lambda: int(dl_width / 0.5625)) or height,  # download_addr['height'] is wrong
            'preference': -2 if video_info.get('has_watermark') else -1,
        }))
    if video_info.get('play_addr_h264'):
        formats.extend(extract_addr(video_info['play_addr_h264'], {
            'format_id': 'play_addr_h264',
            'format_note': 'Direct video',
            'vcodec': 'h264',
        }))
    if video_info.get('play_addr_bytevc1'):
        formats.extend(extract_addr(video_info['play_addr_bytevc1'], {
            'format_id': 'play_addr_bytevc1',
            'format_note': 'Direct video',
            'vcodec': 'h265',
        }))

    # `or []` rather than a .get default: the key may be present with a null value
    for bitrate in video_info.get('bit_rate') or []:
        if bitrate.get('play_addr'):
            formats.extend(extract_addr(bitrate['play_addr'], {
                'format_id': bitrate.get('gear_name'),
                'format_note': 'Playback video',
                'tbr': try_get(bitrate, lambda x: x['bit_rate'] / 1000),
                'vcodec': 'h265' if traverse_obj(
                    bitrate, 'is_bytevc1', 'is_h265') else 'h264',
                'fps': bitrate.get('FPS'),
            }))

    self._remove_duplicate_formats(formats)
    # Copy the webpage session cookie to every media host so the downloads carry it too
    auth_cookie = self._get_cookies(self._WEBPAGE_HOST).get('sid_tt')
    if auth_cookie:
        for f in formats:
            self._set_cookie(compat_urllib_parse_urlparse(f['url']).hostname, 'sid_tt', auth_cookie.value)

    thumbnails = []
    for cover_id in ('cover', 'ai_dynamic_cover', 'animated_cover', 'ai_dynamic_cover_bak',
                     'origin_cover', 'dynamic_cover'):
        for cover_url in traverse_obj(video_info, (cover_id, 'url_list', ...)):
            thumbnails.append({
                'id': cover_id,
                'url': cover_url,
            })

    stats_info = aweme_detail.get('statistics') or {}
    author_info = aweme_detail.get('author') or {}
    music_info = aweme_detail.get('music') or {}
    uploader_handle = traverse_obj(
        author_info, 'sec_uid', 'id', 'uid', 'unique_id',
        expected_type=str_or_none, get_all=False)
    # Without any author identifier there is no profile URL; avoid emitting ".../@None"
    user_url = self._UPLOADER_URL_FORMAT % uploader_handle if uploader_handle else None
    labels = traverse_obj(aweme_detail, ('hybrid_label', ..., 'text'), expected_type=str)

    contained_music_track = traverse_obj(
        music_info, ('matched_song', 'title'), ('matched_pgc_sound', 'title'), expected_type=str)
    contained_music_author = traverse_obj(
        music_info, ('matched_song', 'author'), ('matched_pgc_sound', 'author'), 'author', expected_type=str)

    # For a generic "original sound - <handle>" title, prefer the matched song metadata
    is_generic_og_trackname = music_info.get('is_original_sound') and music_info.get('title') == 'original sound - %s' % music_info.get('owner_handle')
    if is_generic_og_trackname:
        music_track, music_author = contained_music_track or 'original sound', contained_music_author
    else:
        music_track, music_author = music_info.get('title'), music_info.get('author')

    return {
        'id': aweme_id,
        **traverse_obj(aweme_detail, {
            'title': ('desc', {str}),
            'description': ('desc', {str}),
            'timestamp': ('create_time', {int_or_none}),
        }),
        **traverse_obj(stats_info, {
            'view_count': 'play_count',
            'like_count': 'digg_count',
            'repost_count': 'share_count',
            'comment_count': 'comment_count',
        }, expected_type=int_or_none),
        **traverse_obj(author_info, {
            'uploader': 'unique_id',
            'uploader_id': 'uid',
            'creator': 'nickname',
            'channel_id': 'sec_uid',
        }, expected_type=str_or_none),
        'uploader_url': user_url,
        'track': music_track,
        'album': str_or_none(music_info.get('album')) or None,
        'artist': music_author or None,
        'formats': formats,
        'subtitles': self.extract_subtitles(aweme_detail, aweme_id),
        'thumbnails': thumbnails,
        'duration': int_or_none(traverse_obj(video_info, 'duration', ('download_addr', 'duration')), scale=1000),
        'availability': self._availability(
            is_private='Private' in labels,
            needs_subscription='Friends only' in labels,
            is_unlisted='Followers only' in labels),
        '_format_sort_fields': ('quality', 'codec', 'size', 'br'),
    }
358
def _parse_aweme_video_web(self, aweme_detail, webpage_url, video_id):
    """Build an info dict from the video object embedded in the webpage.

    ``aweme_detail`` must contain ``video``. ``webpage_url`` is sent as the
    Referer header for the resulting formats.
    """
    video_info = aweme_detail['video']
    author_info = traverse_obj(aweme_detail, 'authorInfo', 'author', expected_type=dict, default={})
    music_info = aweme_detail.get('music') or {}
    stats_info = aweme_detail.get('stats') or {}
    # Author fields may be nested or sit directly on the video object
    author_source = author_info or aweme_detail
    channel_id = traverse_obj(author_source, (('authorSecId', 'secUid'), {str}), get_all=False)
    user_url = None
    if channel_id:
        user_url = self._UPLOADER_URL_FORMAT % channel_id

    width = int_or_none(video_info.get('width'))
    height = int_or_none(video_info.get('height'))

    play_urls = traverse_obj(video_info, ('playAddr', ((..., 'src'), None), {url_or_none}))
    formats = [{
        'url': self._proto_relative_url(play_url),
        'ext': 'mp4',
        'width': width,
        'height': height,
    } for play_url in play_urls]

    download_urls = traverse_obj(video_info, (('downloadAddr', ('download', 'url')), {url_or_none}))
    formats.extend({
        'format_id': 'download',
        'url': self._proto_relative_url(download_url),
        'ext': 'mp4',
        'width': width,
        'height': height,
    } for download_url in download_urls)

    self._remove_duplicate_formats(formats)

    # Cover images are looked up both on the top-level object and under 'video'
    thumbnail_urls = traverse_obj(aweme_detail, (
        (None, 'video'), ('thumbnail', 'cover', 'dynamicCover', 'originCover'), {url_or_none}))
    thumbnails = [{
        'url': self._proto_relative_url(thumb_url),
        'width': width,
        'height': height,
    } for thumb_url in thumbnail_urls]

    general_meta = traverse_obj(aweme_detail, {
        'title': ('desc', {str}),
        'description': ('desc', {str}),
        'duration': ('video', 'duration', {int_or_none}),
        'timestamp': ('createTime', {int_or_none}),
    })
    author_meta = traverse_obj(author_source, {
        'creator': ('nickname', {str}),
        'uploader': (('uniqueId', 'author'), {str}),
        'uploader_id': (('authorId', 'uid', 'id'), {str_or_none}),
    }, get_all=False)
    stats_meta = traverse_obj(stats_info, {
        'view_count': 'playCount',
        'like_count': 'diggCount',
        'repost_count': 'shareCount',
        'comment_count': 'commentCount',
    }, expected_type=int_or_none)
    music_meta = traverse_obj(music_info, {
        'track': 'title',
        'album': ('album', {lambda x: x or None}),
        'artist': 'authorName',
    }, expected_type=str)

    return {
        'id': video_id,
        **general_meta,
        **author_meta,
        **stats_meta,
        **music_meta,
        'channel_id': channel_id,
        'uploader_url': user_url,
        'formats': formats,
        'thumbnails': thumbnails,
        'http_headers': {
            'Referer': webpage_url,
        }
    }
431
0fd6661e
M
432
433class TikTokIE(TikTokBaseIE):
c4cbd3be 434 _VALID_URL = r'https?://www\.tiktok\.com/(?:embed|@(?P<user_id>[\w\.-]+)?/video)/(?P<id>\d+)'
bfd973ec 435 _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
0fd6661e
M
436
437 _TESTS = [{
438 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610',
0481e266 439 'md5': '736bb7a466c6f0a6afeb597da1e6f5b7',
0fd6661e
M
440 'info_dict': {
441 'id': '6748451240264420610',
442 'ext': 'mp4',
443 'title': '#jassmanak #lehanga #leenabhushan',
444 'description': '#jassmanak #lehanga #leenabhushan',
445 'duration': 13,
0481e266 446 'height': 1024,
447 'width': 576,
0fd6661e
M
448 'uploader': 'leenabhushan',
449 'uploader_id': '6691488002098119685',
0481e266 450 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA_Eb4t1vodM1IuTy_cvp9CY22RAb59xqrO0Xtz9CYQJvgXaDvZxYnZYRzDWhhgJmy',
0fd6661e
M
451 'creator': 'facestoriesbyleenabh',
452 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
453 'upload_date': '20191016',
454 'timestamp': 1571246252,
455 'view_count': int,
456 'like_count': int,
457 'repost_count': int,
458 'comment_count': int,
a44ca5a4 459 'artist': 'Ysrbeats',
460 'album': 'Lehanga',
461 'track': 'Lehanga',
92593690 462 },
463 'skip': '404 Not Found',
0fd6661e
M
464 }, {
465 'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en',
0481e266 466 'md5': '6f3cf8cdd9b28cb8363fe0a9a160695b',
0fd6661e
M
467 'info_dict': {
468 'id': '6742501081818877190',
469 'ext': 'mp4',
470 'title': 'md5:5e2a23877420bb85ce6521dbee39ba94',
471 'description': 'md5:5e2a23877420bb85ce6521dbee39ba94',
472 'duration': 27,
473 'height': 960,
474 'width': 540,
475 'uploader': 'patrox',
476 'uploader_id': '18702747',
0481e266 477 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws',
92593690 478 'channel_id': 'MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws',
0fd6661e
M
479 'creator': 'patroX',
480 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
481 'upload_date': '20190930',
482 'timestamp': 1569860870,
483 'view_count': int,
484 'like_count': int,
485 'repost_count': int,
486 'comment_count': int,
a44ca5a4 487 'artist': 'Evan Todd, Jessica Keenan Wynn, Alice Lee, Barrett Wilbert Weed & Jon Eidson',
488 'track': 'Big Fun',
92593690 489 },
0fd6661e 490 }, {
96f13f01
M
491 # Banned audio, only available on the app
492 'url': 'https://www.tiktok.com/@barudakhb_/video/6984138651336838402',
493 'info_dict': {
494 'id': '6984138651336838402',
495 'ext': 'mp4',
496 'title': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥',
497 'description': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥',
498 'uploader': 'barudakhb_',
499 'creator': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6',
500 'uploader_id': '6974687867511718913',
501 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d',
92593690 502 'channel_id': 'MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d',
96f13f01
M
503 'track': 'Boka Dance',
504 'artist': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6',
505 'timestamp': 1626121503,
506 'duration': 18,
507 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
508 'upload_date': '20210712',
509 'view_count': int,
510 'like_count': int,
511 'repost_count': int,
512 'comment_count': int,
92593690 513 },
96f13f01
M
514 }, {
515 # Sponsored video, only available with feed workaround
516 'url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_/video/7042692929109986561',
517 'info_dict': {
518 'id': '7042692929109986561',
519 'ext': 'mp4',
520 'title': 'Slap and Run!',
521 'description': 'Slap and Run!',
522 'uploader': 'user440922249',
523 'creator': 'Slap And Run',
524 'uploader_id': '7036055384943690754',
525 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_',
92593690 526 'channel_id': 'MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_',
96f13f01
M
527 'track': 'Promoted Music',
528 'timestamp': 1639754738,
529 'duration': 30,
530 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
531 'upload_date': '20211217',
532 'view_count': int,
533 'like_count': int,
534 'repost_count': int,
535 'comment_count': int,
536 },
b09bd0c1 537 'params': {'skip_download': True}, # XXX: unable to download video data: HTTP Error 403: Forbidden
5fa3c9a8
HTL
538 }, {
539 # Video without title and description
540 'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694',
541 'info_dict': {
542 'id': '7059698374567611694',
543 'ext': 'mp4',
b801cd71 544 'title': 'TikTok video #7059698374567611694',
5fa3c9a8
HTL
545 'description': '',
546 'uploader': 'pokemonlife22',
547 'creator': 'Pokemon',
548 'uploader_id': '6820838815978423302',
549 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
92593690 550 'channel_id': 'MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
5fa3c9a8
HTL
551 'track': 'original sound',
552 'timestamp': 1643714123,
553 'duration': 6,
554 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
555 'upload_date': '20220201',
556 'artist': 'Pokemon',
557 'view_count': int,
558 'like_count': int,
559 'repost_count': int,
560 'comment_count': int,
561 },
a39a7ba8 562 }, {
563 # hydration JSON is sent in a <script> element
564 'url': 'https://www.tiktok.com/@denidil6/video/7065799023130643713',
565 'info_dict': {
566 'id': '7065799023130643713',
567 'ext': 'mp4',
568 'title': '#denidil#денидил',
569 'description': '#denidil#денидил',
570 'uploader': 'denidil6',
571 'uploader_id': '7046664115636405250',
572 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAsvMSzFdQ4ikl3uR2TEJwMBbB2yZh2Zxwhx-WCo3rbDpAharE3GQCrFuJArI3C8QJ',
573 'artist': 'Holocron Music',
574 'album': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night',
575 'track': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night',
576 'timestamp': 1645134536,
577 'duration': 26,
578 'upload_date': '20220217',
579 'view_count': int,
580 'like_count': int,
581 'repost_count': int,
582 'comment_count': int,
583 },
f7c5a5e9 584 'skip': 'This video is unavailable',
8ceb07e8 585 }, {
586 # slideshow audio-only mp3 format
587 'url': 'https://www.tiktok.com/@_le_cannibale_/video/7139980461132074283',
588 'info_dict': {
589 'id': '7139980461132074283',
590 'ext': 'mp3',
591 'title': 'TikTok video #7139980461132074283',
592 'description': '',
593 'creator': 'Antaura',
594 'uploader': '_le_cannibale_',
595 'uploader_id': '6604511138619654149',
596 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP',
92593690 597 'channel_id': 'MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP',
8ceb07e8 598 'artist': 'nathan !',
599 'track': 'grahamscott canon',
600 'upload_date': '20220905',
601 'timestamp': 1662406249,
602 'view_count': int,
603 'like_count': int,
604 'repost_count': int,
605 'comment_count': int,
606 'thumbnail': r're:^https://.+\.webp',
607 },
92593690 608 }, {
609 # only available via web
610 'url': 'https://www.tiktok.com/@moxypatch/video/7206382937372134662',
b09bd0c1 611 'md5': '6aba7fad816e8709ff2c149679ace165',
92593690 612 'info_dict': {
613 'id': '7206382937372134662',
614 'ext': 'mp4',
615 'title': 'md5:1d95c0b96560ca0e8a231af4172b2c0a',
616 'description': 'md5:1d95c0b96560ca0e8a231af4172b2c0a',
617 'creator': 'MoxyPatch',
d9b4154c 618 'creators': ['MoxyPatch'],
92593690 619 'uploader': 'moxypatch',
620 'uploader_id': '7039142049363379205',
621 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V',
622 'channel_id': 'MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V',
d9b4154c 623 'artists': ['your worst nightmare'],
92593690 624 'track': 'original sound',
625 'upload_date': '20230303',
626 'timestamp': 1677866781,
627 'duration': 10,
628 'view_count': int,
629 'like_count': int,
630 'repost_count': int,
631 'comment_count': int,
632 'thumbnail': r're:^https://.+',
633 'thumbnails': 'count:3',
634 },
635 'expected_warnings': ['Unable to find video in feed'],
c2a1bdb0 636 }, {
637 # 1080p format
638 'url': 'https://www.tiktok.com/@tatemcrae/video/7107337212743830830',
639 'md5': '982512017a8a917124d5a08c8ae79621',
640 'info_dict': {
641 'id': '7107337212743830830',
642 'ext': 'mp4',
643 'title': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! @musicontiktok',
644 'description': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! @musicontiktok',
645 'uploader': 'tatemcrae',
646 'uploader_id': '86328792343818240',
647 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd',
648 'channel_id': 'MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd',
b09bd0c1 649 'creator': 'tate mcrae',
650 'artist': 'tate mcrae',
c2a1bdb0 651 'track': 'original sound',
652 'upload_date': '20220609',
653 'timestamp': 1654805899,
654 'duration': 150,
655 'view_count': int,
656 'like_count': int,
657 'repost_count': int,
658 'comment_count': int,
659 'thumbnail': r're:^https://.+\.webp',
660 },
d9b4154c 661 'skip': 'Unavailable via feed API, no formats available via web',
b09bd0c1 662 }, {
663 # Slideshow, audio-only m4a format
664 'url': 'https://www.tiktok.com/@hara_yoimiya/video/7253412088251534594',
665 'md5': '2ff8fe0174db2dbf49c597a7bef4e47d',
666 'info_dict': {
667 'id': '7253412088251534594',
668 'ext': 'm4a',
669 'title': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ',
670 'description': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ',
671 'uploader': 'hara_yoimiya',
672 'uploader_id': '6582536342634676230',
673 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB',
674 'channel_id': 'MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB',
675 'creator': 'лампочка',
676 'artist': 'Øneheart',
677 'album': 'watching the stars',
678 'track': 'watching the stars',
679 'upload_date': '20230708',
680 'timestamp': 1688816612,
681 'view_count': int,
682 'like_count': int,
683 'comment_count': int,
684 'repost_count': int,
685 'thumbnail': r're:^https://.+\.webp',
686 },
e0585e65
M
687 }, {
688 # Auto-captions available
689 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758',
690 'only_matching': True
0fd6661e
M
691 }]
692
def _real_extract(self, url):
    """Extract a single video, trying the mobile API first and the webpage second.

    If the API path raises ExtractorError, a warning is reported and the
    video page is downloaded instead. The page is searched for universal
    rehydration data, then sigi state, then next.js data; the first source
    that yields data is used.
    """
    mobj = self._match_valid_url(url)
    video_id = mobj.group('id')
    user_id = mobj.group('user_id')

    try:
        return self._extract_aweme_app(video_id)
    except ExtractorError as e:
        e.expected = True
        self.report_warning(f'{e}; trying with webpage')

    url = self._create_url(user_id, video_id)
    webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'})

    # (lazy loader, debug message, path to status code, path to video data);
    # loaders are only invoked until one of them returns something truthy
    page_sources = (
        (lambda: self._get_universal_data(webpage, video_id),
         'Found universal data for rehydration',
         ('webapp.video-detail', 'statusCode', {int}),
         ('webapp.video-detail', 'itemInfo', 'itemStruct', {dict})),
        (lambda: self._get_sigi_state(webpage, video_id),
         'Found sigi state data',
         ('VideoPage', 'statusCode', {int}),
         ('ItemModule', video_id, {dict})),
        (lambda: self._search_nextjs_data(webpage, video_id, default='{}'),
         'Found next.js data',
         ('props', 'pageProps', 'statusCode', {int}),
         ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict})),
    )
    for load_payload, debug_message, status_path, data_path in page_sources:
        payload = load_payload()
        if not payload:
            continue
        self.write_debug(debug_message)
        status = traverse_obj(payload, status_path) or 0
        video_data = traverse_obj(payload, data_path)
        break
    else:
        raise ExtractorError('Unable to extract webpage video data')

    if video_data and status == 0:
        return self._parse_aweme_video_web(video_data, url, video_id)
    if status == 10216:
        raise ExtractorError('This video is private', expected=True)
    raise ExtractorError(f'Video not available, status code {status}', video_id=video_id)
f7f18f90
A
727
728
class TikTokUserIE(TikTokBaseIE):
    # Playlist extractor for a TikTok user profile URL (https://www.tiktok.com/@<name>).
    # The video list is fetched page by page from the 'aweme/post' endpoint via self._call_api().
    IE_NAME = 'tiktok:user'
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/@(?P<id>[\w\.-]+)/?(?:$|[#?])'
    _WORKING = False
    _TESTS = [{
        'url': 'https://tiktok.com/@corgibobaa?lang=en',
        'playlist_mincount': 45,
        'info_dict': {
            'id': '6935371178089399301',
            'title': 'corgibobaa',
            'thumbnail': r're:https://.+_1080x1080\.webp'
        },
        'expected_warnings': ['Retrying']
    }, {
        'url': 'https://www.tiktok.com/@6820838815978423302',
        'playlist_mincount': 5,
        'info_dict': {
            'id': '6820838815978423302',
            'title': '6820838815978423302',
            'thumbnail': r're:https://.+_1080x1080\.webp'
        },
        'expected_warnings': ['Retrying']
    }, {
        'url': 'https://www.tiktok.com/@meme',
        'playlist_mincount': 593,
        'info_dict': {
            'id': '79005827461758976',
            'title': 'meme',
            'thumbnail': r're:https://.+_1080x1080\.webp'
        },
        'expected_warnings': ['Retrying']
    }]

    # The web-API based implementation below is kept only as a disabled string literal.
    r'''  # TODO: Fix by adding _signature to api_url
    def _entries(self, webpage, user_id, username):
        secuid = self._search_regex(r'\"secUid\":\"(?P<secUid>[^\"]+)', webpage, username)
        verifyfp_cookie = self._get_cookies('https://www.tiktok.com').get('s_v_web_id')
        if not verifyfp_cookie:
            raise ExtractorError('Improper cookies (missing s_v_web_id).', expected=True)
        api_url = f'https://m.tiktok.com/api/post/item_list/?aid=1988&cookie_enabled=true&count=30&verifyFp={verifyfp_cookie.value}&secUid={secuid}&cursor='
        cursor = '0'
        for page in itertools.count():
            data_json = self._download_json(api_url + cursor, username, note='Downloading Page %d' % page)
            for video in data_json.get('itemList', []):
                video_id = video['id']
                video_url = f'https://www.tiktok.com/@{user_id}/video/{video_id}'
                yield self._url_result(video_url, 'TikTok', video_id, str_or_none(video.get('desc')))
            if not data_json.get('hasMore'):
                break
            cursor = data_json['cursor']
    '''

    def _video_entries_api(self, webpage, user_id, username):
        """Yield the raw entries of each 'aweme_list' page returned for user_id.

        Pages are requested from the 'aweme/post' endpoint until the response
        no longer reports 'has_more'; 'max_cursor' from each response is fed
        into the next request. `webpage` is accepted but not used here.
        `username` is only passed to _call_api as the display ID.

        Raises ExtractorError when _call_api fails for any reason other than a
        JSON decode error at position 0 (that case is retried).
        """
        query = {
            'user_id': user_id,
            'count': 21,
            'max_cursor': 0,
            'min_cursor': 0,
            'retry_type': 'no_retry',
            'device_id': ''.join(random.choices(string.digits, k=19)),  # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api.
        }

        for page in itertools.count(1):
            for retry in self.RetryManager():
                try:
                    post_list = self._call_api(
                        'aweme/post', query, username, note=f'Downloading user video list page {page}',
                        errnote='Unable to download user video list')
                except ExtractorError as e:
                    # A decode failure at offset 0 means the body did not even start as JSON
                    # (presumably an empty response); hand it to the retry manager
                    if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0:
                        retry.error = e
                        continue
                    raise
            # `.get(key, [])` would still return None for an explicit JSON null,
            # and `yield from None` raises TypeError, so coerce falsy values to []
            yield from post_list.get('aweme_list') or []
            if not post_list.get('has_more'):
                break
            query['max_cursor'] = post_list['max_cursor']

    def _entries_api(self, user_id, videos):
        """Yield one info dict per item of `videos`.

        Each dict is the result of self._parse_aweme_video_app(video) with the
        extractor fields overridden to TikTok and 'webpage_url' built from
        user_id and the video's 'aweme_id'.
        """
        for video in videos:
            yield {
                **self._parse_aweme_video_app(video),
                'extractor_key': TikTokIE.ie_key(),
                'extractor': 'TikTok',
                'webpage_url': f'https://tiktok.com/@{user_id}/video/{video["aweme_id"]}',
            }

    def _real_extract(self, url):
        """Return a playlist result with the videos of the user named in `url`."""
        user_name = self._match_id(url)
        webpage = self._download_webpage(url, user_name, headers={
            'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)'
        })
        # Numeric ID from a `snssdk…://user/profile/<digits>` link in the page;
        # falls back to the name taken from the URL when no such link is found
        user_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID', default=None) or user_name

        # LazyList lets the first entry be inspected for the thumbnail
        # while the same sequence is still handed to _entries_api below
        videos = LazyList(self._video_entries_api(webpage, user_id, user_name))
        thumbnail = traverse_obj(videos, (0, 'author', 'avatar_larger', 'url_list', 0))

        return self.playlist_result(self._entries_api(user_id, videos), user_id, user_name, thumbnail=thumbnail)
943d5ab1
M
827
828
class TikTokBaseListIE(TikTokBaseIE):  # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor
    # Shared paging logic for list-style extractors. Subclasses provide:
    #   _QUERY_NAME   - name of the query parameter that carries the list ID
    #   _API_ENDPOINT - endpoint passed to self._call_api()

    def _entries(self, list_id, display_id):
        """Yield one info dict per video of the list identified by list_id.

        Pages are requested from self._API_ENDPOINT until the response no
        longer reports 'has_more'; 'cursor' from each response is fed into the
        next request. `display_id` is only passed to _call_api.

        Raises ExtractorError when _call_api fails for any reason other than a
        JSON decode error at position 0 (that case is retried).
        """
        query = {
            self._QUERY_NAME: list_id,
            'cursor': 0,
            'count': 20,
            'type': 5,
            'device_id': ''.join(random.choices(string.digits, k=19))
        }

        for page in itertools.count(1):
            for retry in self.RetryManager():
                try:
                    post_list = self._call_api(
                        self._API_ENDPOINT, query, display_id, note=f'Downloading video list page {page}',
                        errnote='Unable to download video list')
                except ExtractorError as e:
                    # A decode failure at offset 0 means the body did not even start as JSON
                    # (presumably an empty response); hand it to the retry manager
                    if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0:
                        retry.error = e
                        continue
                    raise
            # `.get(key, [])` would still return None for an explicit JSON null,
            # and iterating None raises TypeError, so coerce falsy values to []
            for video in post_list.get('aweme_list') or []:
                yield {
                    **self._parse_aweme_video_app(video),
                    'extractor_key': TikTokIE.ie_key(),
                    'extractor': 'TikTok',
                    'webpage_url': f'https://tiktok.com/@_/video/{video["aweme_id"]}',
                }
            if not post_list.get('has_more'):
                break
            query['cursor'] = post_list['cursor']

    def _real_extract(self, url):
        """Return a playlist result whose ID is the ID matched from `url`."""
        list_id = self._match_id(url)
        return self.playlist_result(self._entries(list_id, list_id), list_id)
864
865
class TikTokSoundIE(TikTokBaseListIE):
    # List extractor for https://www.tiktok.com/music/<slug>-<digits> URLs;
    # the trailing digits are captured as the list ID.
    IE_NAME = 'tiktok:sound'
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/music/[\w\.-]+-(?P<id>[\d]+)[/?#&]?'
    _WORKING = False

    # Consumed by the inherited _entries()
    _QUERY_NAME = 'music_id'
    _API_ENDPOINT = 'music/aweme'

    _TESTS = [
        {
            'url': 'https://www.tiktok.com/music/Build-a-Btch-6956990112127585029?lang=en',
            'playlist_mincount': 100,
            'info_dict': {
                'id': '6956990112127585029',
            },
            'expected_warnings': ['Retrying'],
        },
        {
            # Actual entries are less than listed video count
            'url': 'https://www.tiktok.com/music/jiefei-soap-remix-7036843036118469381',
            'playlist_mincount': 2182,
            'info_dict': {
                'id': '7036843036118469381',
            },
            'expected_warnings': ['Retrying'],
        },
    ]
888
889
class TikTokEffectIE(TikTokBaseListIE):
    # List extractor for https://www.tiktok.com/sticker/<slug>-<digits> URLs;
    # the trailing digits are captured as the list ID.
    IE_NAME = 'tiktok:effect'
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/sticker/[\w\.-]+-(?P<id>[\d]+)[/?#&]?'
    _WORKING = False

    # Consumed by the inherited _entries()
    _QUERY_NAME = 'sticker_id'
    _API_ENDPOINT = 'sticker/aweme'

    _TESTS = [
        {
            'url': 'https://www.tiktok.com/sticker/MATERIAL-GWOOORL-1258156',
            'playlist_mincount': 100,
            'info_dict': {
                'id': '1258156',
            },
            'expected_warnings': ['Retrying'],
        },
        {
            # Different entries between mobile and web, depending on region
            'url': 'https://www.tiktok.com/sticker/Elf-Friend-479565',
            'only_matching': True,
        },
    ]
908
909
class TikTokTagIE(TikTokBaseListIE):
    # List extractor for https://www.tiktok.com/tag/<name> URLs. The URL only
    # carries the tag name, so the numeric tag ID is read from the webpage.
    IE_NAME = 'tiktok:tag'
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/tag/(?P<id>[^/?#&]+)'
    _WORKING = False

    # Consumed by the inherited _entries()
    _QUERY_NAME = 'ch_id'
    _API_ENDPOINT = 'challenge/aweme'

    _TESTS = [
        {
            'url': 'https://tiktok.com/tag/hello2018',
            'playlist_mincount': 39,
            'info_dict': {
                'id': '46294678',
                'title': 'hello2018',
            },
            'expected_warnings': ['Retrying'],
        },
        {
            'url': 'https://tiktok.com/tag/fypシ?is_copy_url=0&is_from_webapp=v1',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Return a playlist of the tag's videos, titled with the tag name from the URL."""
        tag_name = self._match_id(url)
        request_headers = {
            'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)'
        }
        page_html = self._download_webpage(url, tag_name, headers=request_headers)
        # The ID is the digit run of a `snssdk…://challenge/detail/<digits>` link in the page
        numeric_id = self._html_search_regex(r'snssdk\d*://challenge/detail/(\d+)', page_html, 'tag ID')
        entries = self._entries(numeric_id, tag_name)
        return self.playlist_result(entries, numeric_id, tag_name)
936
937
class DouyinIE(TikTokBaseIE):
    # Single-video extractor for https://www.douyin.com/video/<digits> URLs.
    _VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P<id>[0-9]+)'

    # Douyin-specific values for the constants defined on TikTokBaseIE
    _UPLOADER_URL_FORMAT = 'https://www.douyin.com/user/%s'
    _WEBPAGE_HOST = 'https://www.douyin.com/'

    _TESTS = [
        {
            'url': 'https://www.douyin.com/video/6961737553342991651',
            'md5': '9ecce7bc5b302601018ecb2871c63a75',
            'info_dict': {
                'id': '6961737553342991651',
                'ext': 'mp4',
                'title': '#杨超越 小小水手带你去远航❤️',
                'description': '#杨超越 小小水手带你去远航❤️',
                'uploader': '6897520xka',
                'uploader_id': '110403406559',
                'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
                'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
                'creator': '杨超越',
                'creators': ['杨超越'],
                'duration': 19,
                'timestamp': 1620905839,
                'upload_date': '20210513',
                'track': '@杨超越创作的原声',
                'artists': ['杨超越'],
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
                'thumbnail': r're:https?://.+\.jpe?g',
            },
        },
        {
            'url': 'https://www.douyin.com/video/6982497745948921092',
            'md5': '15c5e660b7048af3707304e3cc02bbb5',
            'info_dict': {
                'id': '6982497745948921092',
                'ext': 'mp4',
                'title': '这个夏日和小羊@杨超越 一起遇见白色幻想',
                'description': '这个夏日和小羊@杨超越 一起遇见白色幻想',
                'uploader': '0731chaoyue',
                'uploader_id': '408654318141572',
                'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
                'channel_id': 'MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
                'creator': '杨超越工作室',
                'creators': ['杨超越工作室'],
                'duration': 42,
                'timestamp': 1625739481,
                'upload_date': '20210708',
                'track': '@杨超越工作室创作的原声',
                'artists': ['杨超越工作室'],
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
                'thumbnail': r're:https?://.+\.jpe?g',
            },
        },
        {
            'url': 'https://www.douyin.com/video/6953975910773099811',
            'md5': '0e6443758b8355db9a3c34864a4276be',
            'info_dict': {
                'id': '6953975910773099811',
                'ext': 'mp4',
                'title': '#一起看海 出现在你的夏日里',
                'description': '#一起看海 出现在你的夏日里',
                'uploader': '6897520xka',
                'uploader_id': '110403406559',
                'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
                'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
                'creator': '杨超越',
                'creators': ['杨超越'],
                'duration': 17,
                'timestamp': 1619098692,
                'upload_date': '20210422',
                'track': '@杨超越创作的原声',
                'artists': ['杨超越'],
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
                'thumbnail': r're:https?://.+\.jpe?g',
            },
        },
        {
            'url': 'https://www.douyin.com/video/6950251282489675042',
            'md5': 'b4db86aec367ef810ddd38b1737d2fed',
            'info_dict': {
                'id': '6950251282489675042',
                'ext': 'mp4',
                'title': '哈哈哈,成功了哈哈哈哈哈哈',
                'uploader': '杨超越',
                'upload_date': '20210412',
                'timestamp': 1618231483,
                'uploader_id': '110403406559',
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
            },
            'skip': 'No longer available',
        },
        {
            'url': 'https://www.douyin.com/video/6963263655114722595',
            'md5': '1440bcf59d8700f8e014da073a4dfea8',
            'info_dict': {
                'id': '6963263655114722595',
                'ext': 'mp4',
                'title': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
                'description': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
                'uploader': '6897520xka',
                'uploader_id': '110403406559',
                'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
                'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
                'creator': '杨超越',
                'creators': ['杨超越'],
                'duration': 15,
                'timestamp': 1621261163,
                'upload_date': '20210517',
                'track': '@杨超越创作的原声',
                'artists': ['杨超越'],
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'comment_count': int,
                'thumbnail': r're:https?://.+\.jpe?g',
            },
        },
    ]

    def _real_extract(self, url):
        """Fetch the web detail JSON for the video and parse its 'aweme_detail' dict.

        Raises ExtractorError when the response has no usable 'aweme_detail';
        the error is flagged as expected only if no 's_v_web_id' cookie is set
        for the Douyin host.
        """
        video_id = self._match_id(url)

        # Non-fatal download: a failed request ends up in the cookie error below
        response = self._download_json(
            'https://www.douyin.com/aweme/v1/web/aweme/detail/', video_id,
            'Downloading web detail JSON', 'Failed to download web detail JSON',
            query={'aweme_id': video_id}, fatal=False)
        detail = traverse_obj(response, ('aweme_detail', {dict}))
        if detail:
            return self._parse_aweme_video_app(detail)

        # TODO: Run verification challenge code to generate signature cookies
        verify_cookie = self._get_cookies(self._WEBPAGE_HOST).get('s_v_web_id')
        raise ExtractorError(
            'Fresh cookies (not necessarily logged in) are needed',
            expected=not verify_cookie)
88afe056 1075
1076
class TikTokVMIE(InfoExtractor):
    # Resolves TikTok short links (vm.tiktok.com/<id>, vt.tiktok.com/<id> and
    # tiktok.com/t/<id>) by following their redirect and delegating the target URL.
    #
    # The `www.` prefix is optional: the original `(?:www\.)` group had no
    # quantifier, so the bare `tiktok.com/t/<id>` form could never match.
    _VALID_URL = r'https?://(?:(?:vm|vt)\.tiktok\.com|(?:www\.)?tiktok\.com/t)/(?P<id>\w+)'
    IE_NAME = 'vm.tiktok'

    _TESTS = [{
        'url': 'https://www.tiktok.com/t/ZTRC5xgJp',
        'info_dict': {
            'id': '7170520270497680683',
            'ext': 'mp4',
            'title': 'md5:c64f6152330c2efe98093ccc8597871c',
            'uploader_id': '6687535061741700102',
            'upload_date': '20221127',
            'view_count': int,
            'like_count': int,
            'comment_count': int,
            'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAObqu3WCTXxmw2xwZ3iLEHnEecEIw7ks6rxWqOqOhaPja9BI7gqUQnjw8_5FSoDXX',
            'album': 'Wave of Mutilation: Best of Pixies',
            'thumbnail': r're:https://.+\.webp.*',
            'duration': 5,
            'timestamp': 1669516858,
            'repost_count': int,
            'artist': 'Pixies',
            'track': 'Where Is My Mind?',
            'description': 'md5:c64f6152330c2efe98093ccc8597871c',
            'uploader': 'sigmachaddeus',
            'creator': 'SigmaChad',
        },
    }, {
        'url': 'https://vm.tiktok.com/ZTR45GpSF/',
        'info_dict': {
            'id': '7106798200794926362',
            'ext': 'mp4',
            'title': 'md5:edc3e7ea587847f8537468f2fe51d074',
            'uploader_id': '6997695878846268418',
            'upload_date': '20220608',
            'view_count': int,
            'like_count': int,
            'comment_count': int,
            'thumbnail': r're:https://.+\.webp.*',
            'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAdZ_NcPPgMneaGrW0hN8O_J_bwLshwNNERRF5DxOw2HKIzk0kdlLrR8RkVl1ksrMO',
            'duration': 29,
            'timestamp': 1654680400,
            'repost_count': int,
            'artist': 'Akihitoko',
            'track': 'original sound',
            'description': 'md5:edc3e7ea587847f8537468f2fe51d074',
            'uploader': 'akihitoko1',
            'creator': 'Akihitoko',
        },
    }, {
        'url': 'https://vt.tiktok.com/ZSe4FqkKd',
        'only_matching': True,
    }, {
        # Same short link form without the `www.` prefix
        'url': 'https://tiktok.com/t/ZTRC5xgJp',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Issue a HEAD request for the short link and delegate to the final URL.

        Raises UnsupportedError when the final URL is itself matched by this
        extractor, i.e. the redirect did not lead anywhere else.
        """
        new_url = self._request_webpage(
            HEADRequest(url), self._match_id(url), headers={'User-Agent': 'facebookexternalhit/1.1'}).url
        if self.suitable(new_url):  # Prevent infinite loop in case redirect fails
            raise UnsupportedError(new_url)
        return self.url_result(new_url)
933ed882
JC
1137
1138
class TikTokLiveIE(TikTokBaseIE):
    # Livestream extractor. Accepts either a profile live URL (captures `uploader`)
    # or a mobile share URL (captures the numeric room `id`).
    _VALID_URL = r'''(?x)https?://(?:
        (?:www\.)?tiktok\.com/@(?P<uploader>[\w.-]+)/live|
        m\.tiktok\.com/share/live/(?P<id>\d+)
    )'''
    IE_NAME = 'tiktok:live'

    _TESTS = [
        {
            'url': 'https://www.tiktok.com/@weathernewslive/live',
            'info_dict': {
                'id': '7210809319192726273',
                'ext': 'mp4',
                'title': r're:ウェザーニュースLiVE[\d\s:-]*',
                'creator': 'ウェザーニュースLiVE',
                'uploader': 'weathernewslive',
                'uploader_id': '6621496731283095554',
                'uploader_url': 'https://www.tiktok.com/@weathernewslive',
                'live_status': 'is_live',
                'concurrent_view_count': int,
            },
            'params': {'skip_download': 'm3u8'},
        },
        {
            'url': 'https://www.tiktok.com/@pilarmagenta/live',
            'info_dict': {
                'id': '7209423610325322522',
                'ext': 'mp4',
                'title': str,
                'creator': 'Pilarmagenta',
                'uploader': 'pilarmagenta',
                'uploader_id': '6624846890674683909',
                'uploader_url': 'https://www.tiktok.com/@pilarmagenta',
                'live_status': 'is_live',
                'concurrent_view_count': int,
            },
            'skip': 'Livestream',
        },
        {
            'url': 'https://m.tiktok.com/share/live/7209423610325322522/?language=en',
            'only_matching': True,
        },
        {
            'url': 'https://www.tiktok.com/@iris04201/live',
            'only_matching': True,
        },
    ]

    def _call_api(self, url, param, room_id, uploader, key=None):
        """Query a live-room endpoint and return the dict under `key` if the room is live.

        The room ID is sent as query parameter `param`. Raises ExtractorError
        (expected) when the room is not live and no uploader is known, and
        UserNotLive otherwise.
        """
        api_json = self._download_json(url, room_id, fatal=False, query={
            'aid': '1988',
            param: room_id,
        })
        response = traverse_obj(api_json, (key, {dict}), default={})

        # status == 2 if live else 4
        if int_or_none(response.get('status')) == 2:
            return response
        # If room_id is obtained via mobile share URL and cannot be refreshed, do not wait for live
        if not uploader:
            raise ExtractorError('This livestream has ended', expected=True)
        raise UserNotLive(video_id=uploader)

    def _real_extract(self, url):
        """Resolve the room ID, gather every advertised stream URL and build the info dict."""
        uploader, room_id = self._match_valid_url(url).group('uploader', 'id')
        display_id = uploader or room_id
        # The page is only mandatory when the URL did not already carry a room ID
        webpage = self._download_webpage(
            url, display_id, headers={'User-Agent': 'Mozilla/5.0'}, fatal=not room_id)

        if webpage:
            sigi_state = self._get_sigi_state(webpage, display_id)
            room_id = (
                traverse_obj(sigi_state, ('UserModule', 'users', ..., 'roomId', {str_or_none}), get_all=False)
                or self._search_regex(r'snssdk\d*://live\?room_id=(\d+)', webpage, 'room ID', default=None)
                or room_id)
            if not uploader:
                uploader = traverse_obj(
                    sigi_state, ('LiveRoom', 'liveRoomUserInfo', 'user', 'uniqueId'),
                    ('UserModule', 'users', ..., 'uniqueId'), get_all=False, expected_type=str)

        if not room_id:
            raise UserNotLive(video_id=uploader)

        formats = []
        live_info = self._call_api(
            'https://webcast.tiktok.com/webcast/room/info', 'room_id', room_id, uploader, key='data')

        get_quality = qualities(('SD1', 'ld', 'SD2', 'sd', 'HD1', 'hd', 'FULL_HD1', 'uhd', 'ORIGION', 'origin'))

        def parse_inner(raw):
            # Several fields of the response hold JSON encoded as a string
            return self._parse_json(raw, None)

        sdk_streams = traverse_obj(live_info, (
            'stream_url', 'live_core_sdk_data', 'pull_data', 'stream_data',
            {parse_inner}, 'data', {dict}), default={})
        for quality, stream in sdk_streams.items():
            sdk_params = traverse_obj(stream, ('main', 'sdk_params', {parse_inner}, {
                'vcodec': ('VCodec', {str}),
                'tbr': ('vbitrate', {lambda x: int_or_none(x, 1000)}),
                'resolution': ('resolution', {lambda x: re.match(r'(?i)\d+x\d+|\d+p', x).group().lower()}),
            }))

            if flv_url := traverse_obj(stream, ('main', 'flv', {url_or_none})):
                formats.append({
                    'url': flv_url,
                    'ext': 'flv',
                    'format_id': f'flv-{quality}',
                    'quality': get_quality(quality),
                    **sdk_params,
                })

            if hls_url := traverse_obj(stream, ('main', 'hls', {url_or_none})):
                formats.append({
                    'url': hls_url,
                    'ext': 'mp4',
                    'protocol': 'm3u8_native',
                    'format_id': f'hls-{quality}',
                    'quality': get_quality(quality),
                    **sdk_params,
                })

        def get_vcodec(*keys):
            return traverse_obj(live_info, (
                'stream_url', *keys, {parse_inner}, 'VCodec', {str}))

        for kind in ('hls', 'rtmp'):
            pull_url = traverse_obj(live_info, ('stream_url', f'{kind}_pull_url', {url_or_none}))
            if not pull_url:
                continue
            is_hls = kind == 'hls'
            formats.append({
                'url': pull_url,
                'ext': 'mp4' if is_hls else 'flv',
                'protocol': 'm3u8_native' if is_hls else 'https',
                'format_id': f'{kind}-pull',
                'vcodec': get_vcodec(f'{kind}_pull_url_params'),
                'quality': get_quality('ORIGION'),
            })

        flv_pull_urls = traverse_obj(live_info, ('stream_url', 'flv_pull_url', {dict}), default={})
        for f_id, f_url in flv_pull_urls.items():
            if url_or_none(f_url):
                formats.append({
                    'url': f_url,
                    'ext': 'flv',
                    'format_id': f'flv-{f_id}'.lower(),
                    'vcodec': get_vcodec('flv_pull_url_params', f_id),
                    'quality': get_quality(f_id),
                })

        # If uploader is a guest on another's livestream, primary endpoint will not have m3u8 URLs
        if not any(fmt['ext'] == 'mp4' for fmt in formats):
            detail = self._call_api(
                'https://www.tiktok.com/api/live/detail/', 'roomID', room_id, uploader, key='LiveRoomInfo')
            live_info = merge_dicts(live_info, detail)
            if url_or_none(live_info.get('liveUrl')):
                formats.append({
                    'url': live_info['liveUrl'],
                    'ext': 'mp4',
                    'protocol': 'm3u8_native',
                    'format_id': 'hls-fallback',
                    'vcodec': 'h264',
                    'quality': get_quality('origin'),
                })

        if not uploader:
            uploader = traverse_obj(live_info, ('ownerInfo', 'uniqueId'), ('owner', 'display_id'))

        metadata = traverse_obj(live_info, {
            'title': 'title',
            'uploader_id': (('ownerInfo', 'owner'), 'id', {str_or_none}),
            'creator': (('ownerInfo', 'owner'), 'nickname'),
            'concurrent_view_count': (('user_count', ('liveRoomStats', 'userCount')), {int_or_none}),
        }, get_all=False)

        return {
            'id': room_id,
            'uploader': uploader,
            'uploader_url': format_field(uploader, None, self._UPLOADER_URL_FORMAT) or None,
            'is_live': True,
            'formats': formats,
            '_format_sort_fields': ('quality', 'ext'),
            **metadata,
        }