]> jfr.im git - yt-dlp.git/blame_incremental - yt_dlp/extractor/tiktok.py
[cleanup] Add more ruff rules (#10149)
[yt-dlp.git] / yt_dlp / extractor / tiktok.py
... / ...
CommitLineData
1import functools
2import itertools
3import json
4import random
5import re
6import string
7import time
8import urllib.parse
9import uuid
10
11from .common import InfoExtractor
12from ..networking import HEADRequest
13from ..utils import (
14 ExtractorError,
15 UnsupportedError,
16 UserNotLive,
17 determine_ext,
18 filter_dict,
19 format_field,
20 int_or_none,
21 join_nonempty,
22 merge_dicts,
23 mimetype2ext,
24 parse_qs,
25 qualities,
26 remove_start,
27 srt_subtitles_timecode,
28 str_or_none,
29 traverse_obj,
30 try_call,
31 try_get,
32 url_or_none,
33)
34
35
36class TikTokBaseIE(InfoExtractor):
# URL template for profile pages; formatted with a handle, user ID or sec_uid
_UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s'
# Web origin whose `sid_tt` cookie is copied to the API and media hosts
_WEBPAGE_HOST = 'https://www.tiktok.com/'
# Resolution labels handed to `qualities()` to rank formats
QUALITIES = ('360p', '540p', '720p', '1080p')

# Defaults for the app info fields. The key order is significant: it is the order
# of the slash-separated fields in a user-supplied `app_info` string.
_APP_INFO_DEFAULTS = {
    # unique "install id"
    'iid': None,
    # TikTok (KR/PH/TW/TH/VN) = trill, TikTok (rest of world) = musical_ly, Douyin = aweme
    'app_name': 'musical_ly',
    'app_version': '34.1.2',
    'manifest_app_version': '2023401020',
    # "app id": aweme = 1128, trill = 1180, musical_ly = 1233, universal = 0
    'aid': '0',
}
# Candidate app infos not tried yet; built on the first `_get_next_app_info()` call
_APP_INFO_POOL = None
# App info currently used for API requests
_APP_INFO = None
# User agent derived from the current app info
_APP_USER_AGENT = None
54
@functools.cached_property
def _KNOWN_APP_INFO(self):
    """Values of the `app_info` extractor argument (looked up under the TikTokIE key).

    Without that argument the default is a single empty string when a device ID
    was supplied, and an empty list otherwise.
    """
    if self._KNOWN_DEVICE_ID:
        # If we have a genuine device ID, we may not need any IID
        fallback = ['']
    else:
        fallback = []
    return self._configuration_arg('app_info', fallback, ie_key=TikTokIE)
60
@functools.cached_property
def _KNOWN_DEVICE_ID(self):
    """First value of the `device_id` extractor argument; the default list is `[None]`."""
    device_ids = self._configuration_arg('device_id', [None], ie_key=TikTokIE)
    return device_ids[0]
64
@functools.cached_property
def _DEVICE_ID(self):
    """Device ID sent to the API: the user-supplied one, else a random one cached per instance."""
    known_id = self._KNOWN_DEVICE_ID
    if known_id:
        return known_id
    # No genuine ID available: draw a random integer and send it as a string
    return str(random.randint(7250000000000000000, 7351147085025500000))
68
@functools.cached_property
def _API_HOSTNAME(self):
    """Hostname of the mobile API, overridable through the `api_hostname` extractor argument."""
    hostnames = self._configuration_arg(
        'api_hostname', ['api16-normal-c-useast1a.tiktokv.com'], ie_key=TikTokIE)
    return hostnames[0]
73
def _get_next_app_info(self):
    """Switch to the next candidate app info and rebuild the app user agent.

    The pool of candidates is built on the first call from `_KNOWN_APP_INFO`.
    Returns True if a candidate was activated, False if the pool is empty.
    """
    if self._APP_INFO_POOL is None:
        # Per-field defaults; every field except the install ID can be overridden
        # by an extractor argument of the same name
        base_info = {}
        for field, fallback in self._APP_INFO_DEFAULTS.items():
            if field == 'iid':
                continue
            base_info[field] = self._configuration_arg(field, [fallback], ie_key=TikTokIE)[0]

        pool = []
        for raw_info in self._KNOWN_APP_INFO:
            # Fields are positional and slash-separated; empty fields keep the default
            overrides = {
                field: value
                for field, value in zip(self._APP_INFO_DEFAULTS, raw_info.split('/'))
                if value
            }
            pool.append({**base_info, **overrides})
        self._APP_INFO_POOL = pool

    if not self._APP_INFO_POOL:
        return False

    self._APP_INFO = self._APP_INFO_POOL.pop(0)
    current = self._APP_INFO

    app_name, version = current['app_name'], current['manifest_app_version']
    # musical_ly has its own package name; the others (trill, aweme) share a prefix
    package = (
        f'com.zhiliaoapp.musically/{version}' if app_name == 'musical_ly'
        else f'com.ss.android.ugc.{app_name}/{version}')
    self._APP_USER_AGENT = f'{package} (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)'

    return True
101
102 @staticmethod
103 def _create_url(user_id, video_id):
104 return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}'
105
def _get_sigi_state(self, webpage, display_id):
    """Extract the JSON of the `SIGI_STATE`/`sigi-persisted-data` script tag (default `{}`)."""
    script_start = r'<script[^>]+\bid="(?:SIGI_STATE|sigi-persisted-data)"[^>]*>'
    return self._search_json(
        script_start, webpage, 'sigi state', display_id,
        end_pattern=r'</script>', default={})
110
def _get_universal_data(self, webpage, display_id):
    """Return the `__DEFAULT_SCOPE__` dict of the universal rehydration JSON, or `{}`."""
    rehydration_data = self._search_json(
        r'<script[^>]+\bid="__UNIVERSAL_DATA_FOR_REHYDRATION__"[^>]*>', webpage,
        'universal data', display_id, end_pattern=r'</script>', default={})
    default_scope = traverse_obj(rehydration_data, ('__DEFAULT_SCOPE__', {dict}))
    return default_scope or {}
116
def _call_api_impl(self, ep, query, video_id, fatal=True,
                   note='Downloading API JSON', errnote='Unable to download API page'):
    """Perform a single request against the `/aweme/v1/<ep>/` API endpoint.

    Sets a fresh random `odin_tt` cookie for the API host and, if the web host has
    a `sid_tt` cookie, copies it to the API host before downloading the JSON.
    """
    api_host = self._API_HOSTNAME
    # 160 random hex digits, regenerated for every request
    odin_token = ''.join(random.choices('0123456789abcdef', k=160))
    self._set_cookie(api_host, 'odin_tt', odin_token)

    session_cookie = self._get_cookies(self._WEBPAGE_HOST).get('sid_tt')
    if session_cookie:
        self._set_cookie(api_host, 'sid_tt', session_cookie.value)

    request_headers = {
        'User-Agent': self._APP_USER_AGENT,
        'Accept': 'application/json',
    }
    return self._download_json(
        f'https://{api_host}/aweme/v1/{ep}/', video_id=video_id,
        fatal=fatal, note=note, errnote=errnote,
        headers=request_headers, query=query)
129
def _build_api_query(self, query):
    """Combine `query` with the device/app parameters sent along with every API call.

    The fixed parameters take precedence over same-named keys in `query`. The result
    is passed through `filter_dict` (presumably to drop unset values such as a
    missing `iid` -- confirm against the helper).
    """
    app_info = self._APP_INFO
    app_version = app_info['app_version']
    manifest_version = app_info['manifest_app_version']
    # Each dotted component zero-padded to two digits, e.g. '34.1.2' -> '340102'
    version_code = ''.join(f'{int(part):02d}' for part in app_version.split('.'))

    device_params = {
        'device_platform': 'android',
        'os': 'android',
        'ssmix': 'a',
        '_rticket': int(time.time() * 1000),
        'cdid': str(uuid.uuid4()),
        'channel': 'googleplay',
        'aid': app_info['aid'],
        'app_name': app_info['app_name'],
        'version_code': version_code,
        'version_name': app_version,
        'manifest_version_code': manifest_version,
        'update_version_code': manifest_version,
        'ab_version': app_version,
        'resolution': '1080*2400',
        'dpi': 420,
        'device_type': 'Pixel 7',
        'device_brand': 'Google',
        'language': 'en',
        'os_api': '29',
        'os_version': '13',
        'ac': 'wifi',
        'is_pad': '0',
        'current_region': 'US',
        'app_type': 'normal',
        'sys_region': 'US',
        # pretend the app was installed between 1 and 13 days ago
        'last_install_time': int(time.time()) - random.randint(86400, 1123200),
        'timezone_name': 'America/New_York',
        'residence': 'US',
        'app_language': 'en',
        'timezone_offset': '-14400',
        'host_abi': 'armeabi-v7a',
        'locale': 'en',
        'ac2': 'wifi5g',
        'uoo': '1',
        'carrier_region': 'US',
        'op_region': 'US',
        'build_number': app_version,
        'region': 'US',
        'ts': int(time.time()),
        'iid': app_info.get('iid'),
        'device_id': self._DEVICE_ID,
        'openudid': ''.join(random.choices('0123456789abcdef', k=16)),
    }
    return filter_dict({**query, **device_params})
176
def _call_api(self, ep, query, video_id, fatal=True,
              note='Downloading API JSON', errnote='Unable to download API page'):
    """Call an API endpoint, moving on to the next app info when the response is not JSON.

    A JSON decode error at position 0 triggers a retry with the next candidate
    app info; any other ExtractorError is re-raised. When no candidate is left,
    the error is re-raised if `fatal`, otherwise a warning is reported and None
    is returned.
    """
    if not (self._APP_INFO or self._get_next_app_info()):
        message = 'No working app info is available'
        if fatal:
            raise ExtractorError(message, expected=True)
        self.report_warning(message)
        return None

    max_tries = len(self._APP_INFO_POOL) + 1  # _APP_INFO_POOL + _APP_INFO
    attempt = 0
    while True:
        attempt += 1
        self.write_debug(str(self._APP_INFO))
        # Rebuilt on every attempt because it depends on the current app info
        real_query = self._build_api_query(query)
        try:
            return self._call_api_impl(ep, real_query, video_id, fatal, note, errnote)
        except ExtractorError as e:
            failed_at_start = isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0
            if not failed_at_start:
                raise
            message = str(e.cause or e.msg)
            if self._get_next_app_info():
                self.report_warning(f'{message}. Retrying... (attempt {attempt} of {max_tries})')
                continue
            if fatal:
                raise
            self.report_warning(message)
            return None
205
def _extract_aweme_app(self, aweme_id):
    """Look up `aweme_id` in the `feed` API response and parse the matching entry.

    Raises ExtractorError when the feed does not contain the requested video.
    """
    feed = self._call_api(
        'feed', {'aweme_id': aweme_id}, aweme_id, note='Downloading video feed',
        errnote='Unable to download video feed')
    for candidate in feed.get('aweme_list') or []:
        # the feed may hold other videos; compare IDs as strings
        if str(candidate.get('aweme_id')) == aweme_id:
            return self._parse_aweme_video_app(candidate)
    raise ExtractorError('Unable to find video in feed', video_id=aweme_id)
214
def _extract_web_data_and_status(self, url, video_id, fatal=True):
    """Download the video webpage and return `(video_data, status)` from its embedded JSON.

    Sources are tried in order: universal rehydration data, sigi state, next.js data.
    A missing status code is reported as 0. If no source is found, ExtractorError
    is raised when `fatal`, else `({}, None)` is returned.
    """
    webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'}, fatal=fatal) or ''

    universal_data = self._get_universal_data(webpage, video_id)
    if universal_data:
        self.write_debug('Found universal data for rehydration')
        status = traverse_obj(universal_data, ('webapp.video-detail', 'statusCode', {int})) or 0
        video_data = traverse_obj(universal_data, ('webapp.video-detail', 'itemInfo', 'itemStruct', {dict}))
        return video_data, status

    sigi_data = self._get_sigi_state(webpage, video_id)
    if sigi_data:
        self.write_debug('Found sigi state data')
        status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0
        video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict}))
        return video_data, status

    next_data = self._search_nextjs_data(webpage, video_id, default={})
    if next_data:
        self.write_debug('Found next.js data')
        status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0
        video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict}))
        return video_data, status

    if fatal:
        raise ExtractorError('Unable to extract webpage video data')

    return {}, None
238
def _get_subtitles(self, aweme_detail, aweme_id, user_name):
    """Collect subtitles, trying each source only if the previous one yielded nothing.

    Sources in order: auto-captions of the aweme/detail data (downloaded and
    converted to SRT), caption URLs of the feed data, and `subtitleInfos` of the
    web data. When `user_name` is given, the webpage is downloaded for the last step.
    """
    # TODO: Extract text positioning info
    subtitles = {}

    # aweme/detail endpoint subs
    auto_captions = traverse_obj(
        aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 'auto_captions', ...), expected_type=dict)
    for caption in auto_captions:
        caption_url = traverse_obj(caption, ('url', 'url_list', ...), expected_type=url_or_none, get_all=False)
        if not caption_url:
            continue
        caption_json = self._download_json(
            caption_url, aweme_id, note='Downloading captions', errnote='Unable to download captions', fatal=False)
        if not caption_json:
            continue
        track = subtitles.setdefault(caption.get('language', 'en'), [])
        cues = []
        # cue numbers follow the utterance position, so skipped utterances leave gaps
        for number, line in enumerate(caption_json['utterances'], 1):
            if not line.get('text'):
                continue
            start = srt_subtitles_timecode(line["start_time"] / 1000)
            end = srt_subtitles_timecode(line["end_time"] / 1000)
            cues.append(f'{number}\n{start} --> {end}\n{line["text"]}')
        track.append({
            'ext': 'srt',
            'data': '\n\n'.join(cues),
        })

    # feed endpoint subs
    if not subtitles:
        for caption in traverse_obj(aweme_detail, ('video', 'cla_info', 'caption_infos', ...), expected_type=dict):
            if not caption.get('url'):
                continue
            track = subtitles.setdefault(caption.get('lang') or 'en', [])
            track.append({
                'ext': remove_start(caption.get('caption_format'), 'web'),
                'url': caption['url'],
            })

    # webpage subs
    if not subtitles:
        if user_name:  # only _parse_aweme_video_app needs to extract the webpage here
            aweme_detail, _ = self._extract_web_data_and_status(
                self._create_url(user_name, aweme_id), aweme_id, fatal=False)
        for caption in traverse_obj(aweme_detail, ('video', 'subtitleInfos', lambda _, v: v['Url'])):
            track = subtitles.setdefault(caption.get('LanguageCodeName') or 'en', [])
            track.append({
                'ext': remove_start(caption.get('Format'), 'web'),
                'url': caption['Url'],
            })

    return subtitles
279
def _parse_url_key(self, url_key):
    """Split a URL key of the form `v..._<codec>_<res>p_<bitrate>` into format fields.

    Returns `(format_dict, res)` where `res` is the resolution label (e.g. '720p'),
    or `({}, None)` when the key does not match.
    """
    group_names = ('id', 'codec', 'res', 'bitrate')
    format_id, codec, res, bitrate = self._search_regex(
        r'v[^_]+_(?P<id>(?P<codec>[^_]+)_(?P<res>\d+p)_(?P<bitrate>\d+))', url_key,
        'url key', default=(None, None, None, None), group=group_names)
    if not format_id:
        return {}, None

    # 'bytevc1' is reported as h265; any other codec name is passed through
    vcodec = 'h265' if codec == 'bytevc1' else codec
    rank_resolution = qualities(self.QUALITIES)
    format_info = {
        'format_id': format_id,
        'vcodec': vcodec,
        'tbr': int_or_none(bitrate, scale=1000) or None,
        'quality': rank_resolution(res),
    }
    return format_info, res
292
def _parse_aweme_video_app(self, aweme_detail):
    """Convert an aweme object of the app API into an info dict.

    `aweme_detail` must contain the `aweme_id` and `video` keys (KeyError otherwise).
    Formats are gathered from `play_addr`, `download_addr`, `play_addr_h264`,
    `play_addr_bytevc1` and the `play_addr` of each `bit_rate` entry, then
    de-duplicated. If the web host has a `sid_tt` cookie, it is copied to every
    format URL's host.
    """
    aweme_id = aweme_detail['aweme_id']
    video_info = aweme_detail['video']
    # width/height first seen for each resolution label, shared by all addresses
    known_resolutions = {}

    def audio_meta(url):
        """Return audio-only overrides for mp3 / `-music-` URLs, else an empty dict."""
        ext = determine_ext(url, default_ext='m4a')
        return {
            'format_note': 'Music track',
            'ext': ext,
            'acodec': 'aac' if ext == 'm4a' else ext,
            'vcodec': 'none',
            'width': None,
            'height': None,
        } if ext == 'mp3' or '-music-' in url else {}

    def extract_addr(addr, add_meta=None):
        """Build one format dict per entry of `addr['url_list']`.

        `add_meta` supplies fallback fields; values parsed from the URL key win over it.
        """
        # Work on a private copy: neither a shared default nor the caller's dict
        # may be modified by the `setdefault` below
        add_meta = dict(add_meta or {})
        # `url_key` may be present but null
        parsed_meta, res = self._parse_url_key(addr.get('url_key') or '')
        is_bytevc2 = parsed_meta.get('vcodec') == 'bytevc2'
        if res:
            known_resolutions.setdefault(res, {}).setdefault('height', int_or_none(addr.get('height')))
            known_resolutions[res].setdefault('width', int_or_none(addr.get('width')))
            parsed_meta.update(known_resolutions.get(res, {}))
            add_meta.setdefault('height', int_or_none(res[:-1]))
        return [{
            'url': url,
            'filesize': int_or_none(addr.get('data_size')),
            'ext': 'mp4',
            'acodec': 'aac',
            'source_preference': -2 if 'aweme/v1' in url else -1,  # Downloads from API might get blocked
            **add_meta, **parsed_meta,
            # bytevc2 is bytedance's own custom h266/vvc codec, as-of-yet unplayable
            'preference': -100 if is_bytevc2 else -1,
            'format_note': join_nonempty(
                add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None,
                '(UNPLAYABLE)' if is_bytevc2 else None, delim=' '),
            **audio_meta(url),
        } for url in addr.get('url_list') or []]

    # Hack: Add direct video links first to prioritize them when removing duplicate formats
    formats = []
    width = int_or_none(video_info.get('width'))
    height = int_or_none(video_info.get('height'))
    # fall back to 9:16 when the dimensions are missing or zero
    ratio = try_call(lambda: width / height) or 0.5625
    if video_info.get('play_addr'):
        formats.extend(extract_addr(video_info['play_addr'], {
            'format_id': 'play_addr',
            'format_note': 'Direct video',
            'vcodec': 'h265' if traverse_obj(
                video_info, 'is_bytevc1', 'is_h265') else 'h264',  # TODO: Check for "direct iOS" videos, like https://www.tiktok.com/@cookierun_dev/video/7039716639834656002
            'width': width,
            'height': height,
        }))
    if video_info.get('download_addr'):
        download_addr = video_info['download_addr']
        dl_width = int_or_none(download_addr.get('width'))
        watermark_note = ', watermarked' if video_info.get('has_watermark') else ''
        formats.extend(extract_addr(download_addr, {
            'format_id': 'download_addr',
            'format_note': f'Download video{watermark_note}',
            'vcodec': 'h264',
            'width': dl_width,
            'height': try_call(lambda: int(dl_width / ratio)),  # download_addr['height'] is wrong
            'preference': -2 if video_info.get('has_watermark') else -1,
        }))
    if video_info.get('play_addr_h264'):
        formats.extend(extract_addr(video_info['play_addr_h264'], {
            'format_id': 'play_addr_h264',
            'format_note': 'Direct video',
            'vcodec': 'h264',
        }))
    if video_info.get('play_addr_bytevc1'):
        formats.extend(extract_addr(video_info['play_addr_bytevc1'], {
            'format_id': 'play_addr_bytevc1',
            'format_note': 'Direct video',
            'vcodec': 'h265',
        }))

    # `bit_rate` may be present but null, hence `or []` instead of a `.get` default
    for bitrate in video_info.get('bit_rate') or []:
        if bitrate.get('play_addr'):
            formats.extend(extract_addr(bitrate['play_addr'], {
                'format_id': bitrate.get('gear_name'),
                'format_note': 'Playback video',
                'tbr': try_get(bitrate, lambda x: x['bit_rate'] / 1000),
                'vcodec': 'h265' if traverse_obj(
                    bitrate, 'is_bytevc1', 'is_h265') else 'h264',
                'fps': bitrate.get('FPS'),
            }))

    self._remove_duplicate_formats(formats)
    auth_cookie = self._get_cookies(self._WEBPAGE_HOST).get('sid_tt')
    if auth_cookie:
        for f in formats:
            self._set_cookie(urllib.parse.urlparse(f['url']).hostname, 'sid_tt', auth_cookie.value)

    thumbnails = [
        {
            'id': cover_id,
            'url': cover_url,
        }
        for cover_id in ('cover', 'ai_dynamic_cover', 'animated_cover', 'ai_dynamic_cover_bak',
                         'origin_cover', 'dynamic_cover')
        for cover_url in traverse_obj(video_info, (cover_id, 'url_list', ...))
    ]

    stats_info = aweme_detail.get('statistics') or {}
    music_info = aweme_detail.get('music') or {}
    labels = traverse_obj(aweme_detail, ('hybrid_label', ..., 'text'), expected_type=str)

    contained_music_track = traverse_obj(
        music_info, ('matched_song', 'title'), ('matched_pgc_sound', 'title'), expected_type=str)
    contained_music_author = traverse_obj(
        music_info, ('matched_song', 'author'), ('matched_pgc_sound', 'author'), 'author', expected_type=str)

    # A title of the form "original sound - <handle>" carries no real track name,
    # so prefer the matched song/sound when there is one
    is_generic_og_trackname = (
        music_info.get('is_original_sound')
        and music_info.get('title') == f'original sound - {music_info.get("owner_handle")}')
    if is_generic_og_trackname:
        music_track, music_author = contained_music_track or 'original sound', contained_music_author
    else:
        music_track, music_author = music_info.get('title'), traverse_obj(music_info, ('author', {str}))

    author_info = traverse_obj(aweme_detail, ('author', {
        'uploader': ('unique_id', {str}),
        'uploader_id': ('uid', {str_or_none}),
        'channel': ('nickname', {str}),
        'channel_id': ('sec_uid', {str}),
    }))

    return {
        'id': aweme_id,
        **traverse_obj(aweme_detail, {
            'title': ('desc', {str}),
            'description': ('desc', {str}),
            'timestamp': ('create_time', {int_or_none}),
        }),
        **traverse_obj(stats_info, {
            'view_count': 'play_count',
            'like_count': 'digg_count',
            'repost_count': 'share_count',
            'comment_count': 'comment_count',
        }, expected_type=int_or_none),
        **author_info,
        'channel_url': format_field(author_info, 'channel_id', self._UPLOADER_URL_FORMAT, default=None),
        'uploader_url': format_field(
            author_info, ['uploader', 'uploader_id'], self._UPLOADER_URL_FORMAT, default=None),
        'track': music_track,
        'album': str_or_none(music_info.get('album')) or None,
        'artists': re.split(r'(?:, | & )', music_author) if music_author else None,
        'formats': formats,
        'subtitles': self.extract_subtitles(
            aweme_detail, aweme_id, traverse_obj(author_info, 'uploader', 'uploader_id', 'channel_id')),
        'thumbnails': thumbnails,
        # video durations are scaled down by 1000; the music duration is the fallback
        'duration': (traverse_obj(video_info, (
            (None, 'download_addr'), 'duration', {functools.partial(int_or_none, scale=1000)}, any))
            or traverse_obj(music_info, ('duration', {int_or_none}))),
        'availability': self._availability(
            is_private='Private' in labels,
            needs_subscription='Friends only' in labels,
            is_unlisted='Followers only' in labels),
        '_format_sort_fields': ('quality', 'codec', 'size', 'br'),
    }
451
def _extract_web_formats(self, aweme_detail):
    """Build the format list from the `video` (and, as a last resort, `music`) web data.

    Formats come from `bitrateInfo`, `playAddr` and the download address. Formats
    whose URL lacks 'unwatermarked' are noted as watermarked. If no format was
    found at all, the music `playUrl` is offered as a single audio-only format.
    """
    base_format = {
        'ext': 'mp4',
        'vcodec': 'h264',
        'acodec': 'aac',
    }
    video_info = traverse_obj(aweme_detail, ('video', {dict})) or {}
    play_width = int_or_none(video_info.get('width'))
    play_height = int_or_none(video_info.get('height'))
    # fall back to 9:16 when the dimensions are missing or zero
    ratio = try_call(lambda: play_width / play_height) or 0.5625
    formats = []

    for bitrate_info in traverse_obj(video_info, ('bitrateInfo', lambda _, v: v['PlayAddr']['UrlList'])):
        url_key = traverse_obj(bitrate_info, ('PlayAddr', 'UrlKey', {str})) or ''
        format_info, res = self._parse_url_key(url_key)
        # bytevc2 is bytedance's own custom h266/vvc codec, as-of-yet unplayable
        unplayable = format_info.get('vcodec') == 'bytevc2'
        format_info.update({
            'format_note': 'UNPLAYABLE' if unplayable else None,
            'preference': -100 if unplayable else -1,
            'filesize': traverse_obj(bitrate_info, ('PlayAddr', 'DataSize', {int_or_none})),
        })

        dimension = res and int(res[:-1])
        if dimension:
            if dimension == 540:  # '540p' is actually 576p
                dimension = 576
            if ratio < 1:  # portrait: res/dimension is width
                raw_height = int(dimension / ratio)
                # round the computed height down to an even number
                size = {'width': dimension, 'height': raw_height - (raw_height % 2)}
            else:  # landscape: res/dimension is height
                raw_width = int(dimension * ratio)
                # round the computed width up to an even number
                size = {'width': raw_width + (raw_width % 2), 'height': dimension}
            format_info.update(size)

        formats.extend(
            {
                **base_format,
                **format_info,
                'url': self._proto_relative_url(video_url),
            }
            for video_url in traverse_obj(bitrate_info, ('PlayAddr', 'UrlList', ..., {url_or_none})))

    # We don't have res string for play formats, but need quality for sorting & de-duplication
    play_quality = traverse_obj(formats, (lambda _, v: v['width'] == play_width, 'quality', any))

    formats.extend(
        {
            **base_format,
            'format_id': 'play',
            'url': self._proto_relative_url(play_url),
            'width': play_width,
            'height': play_height,
            'quality': play_quality,
        }
        for play_url in traverse_obj(video_info, ('playAddr', ((..., 'src'), None), {url_or_none})))

    formats.extend(
        {
            **base_format,
            'format_id': 'download',
            'url': self._proto_relative_url(download_url),
        }
        for download_url in traverse_obj(video_info, (('downloadAddr', ('download', 'url')), {url_or_none})))

    self._remove_duplicate_formats(formats)

    for fmt in traverse_obj(formats, lambda _, v: 'unwatermarked' not in v['url']):
        note = join_nonempty(fmt.get('format_note'), 'watermarked', delim=', ')
        fmt.update({
            'format_note': note,
            'preference': fmt.get('preference') or -2,
        })

    # Is it a slideshow with only audio for download?
    if not formats and traverse_obj(aweme_detail, ('music', 'playUrl', {url_or_none})):
        audio_url = aweme_detail['music']['playUrl']
        # the `mime_type` query value uses '_' where a MIME type has '/'
        ext = traverse_obj(parse_qs(audio_url), (
            'mime_type', -1, {lambda x: x.replace('_', '/')}, {mimetype2ext})) or 'm4a'
        formats.append({
            'format_id': 'audio',
            'url': self._proto_relative_url(audio_url),
            'ext': ext,
            'acodec': 'aac' if ext == 'm4a' else ext,
            'vcodec': 'none',
        })

    return formats
540
def _parse_aweme_video_web(self, aweme_detail, webpage_url, video_id, extract_flat=False):
    """Convert the item data embedded in a webpage into an info dict.

    With `extract_flat`, formats and subtitles are not extracted and are set to None.
    Later groups of fields override earlier ones, so the video duration (when
    non-zero) replaces the music duration.
    """
    author_info = traverse_obj(aweme_detail, (('authorInfo', 'author', None), {
        'channel': ('nickname', {str}),
        'channel_id': (('authorSecId', 'secUid'), {str}),
        'uploader': (('uniqueId', 'author'), {str}),
        'uploader_id': (('authorId', 'uid', 'id'), {str_or_none}),
    }), get_all=False)

    if extract_flat:
        formats = subtitles = None
    else:
        formats = self._extract_web_formats(aweme_detail)
        subtitles = self.extract_subtitles(aweme_detail, video_id, None)

    channel_url = format_field(author_info, 'channel_id', self._UPLOADER_URL_FORMAT, default=None)
    uploader_url = format_field(
        author_info, ['uploader', 'uploader_id'], self._UPLOADER_URL_FORMAT, default=None)

    music_fields = traverse_obj(aweme_detail, ('music', {
        'track': ('title', {str}),
        'album': ('album', {str}, {lambda x: x or None}),
        'artists': ('authorName', {str}, {lambda x: re.split(r'(?:, | & )', x) if x else None}),
        'duration': ('duration', {int_or_none}),
    }))
    video_fields = traverse_obj(aweme_detail, {
        'title': ('desc', {str}),
        'description': ('desc', {str}),
        # audio-only slideshows have a video duration of 0 and an actual audio duration
        'duration': ('video', 'duration', {int_or_none}, {lambda x: x or None}),
        'timestamp': ('createTime', {int_or_none}),
    })
    stat_fields = traverse_obj(aweme_detail, ('stats', {
        'view_count': 'playCount',
        'like_count': 'diggCount',
        'repost_count': 'shareCount',
        'comment_count': 'commentCount',
    }), expected_type=int_or_none)
    thumbnails = traverse_obj(aweme_detail, (
        (None, 'video'), ('thumbnail', 'cover', 'dynamicCover', 'originCover'), {
            'url': ({url_or_none}, {self._proto_relative_url}),
        },
    ))

    return {
        'id': video_id,
        'formats': formats,
        'subtitles': subtitles,
        'http_headers': {'Referer': webpage_url},
        **author_info,
        'channel_url': channel_url,
        'uploader_url': uploader_url,
        **music_fields,
        **video_fields,
        **stat_fields,
        'thumbnails': thumbnails,
    }
583
584
585class TikTokIE(TikTokBaseIE):
586 _VALID_URL = r'https?://www\.tiktok\.com/(?:embed|@(?P<user_id>[\w\.-]+)?/video)/(?P<id>\d+)'
587 _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
588
589 _TESTS = [{
590 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610',
591 'md5': '736bb7a466c6f0a6afeb597da1e6f5b7',
592 'info_dict': {
593 'id': '6748451240264420610',
594 'ext': 'mp4',
595 'title': '#jassmanak #lehanga #leenabhushan',
596 'description': '#jassmanak #lehanga #leenabhushan',
597 'duration': 13,
598 'height': 1024,
599 'width': 576,
600 'uploader': 'leenabhushan',
601 'uploader_id': '6691488002098119685',
602 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA_Eb4t1vodM1IuTy_cvp9CY22RAb59xqrO0Xtz9CYQJvgXaDvZxYnZYRzDWhhgJmy',
603 'creator': 'facestoriesbyleenabh',
604 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
605 'upload_date': '20191016',
606 'timestamp': 1571246252,
607 'view_count': int,
608 'like_count': int,
609 'repost_count': int,
610 'comment_count': int,
611 'artist': 'Ysrbeats',
612 'album': 'Lehanga',
613 'track': 'Lehanga',
614 },
615 'skip': '404 Not Found',
616 }, {
617 'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en',
618 'md5': 'f21112672ee4ce05ca390fb6522e1b6f',
619 'info_dict': {
620 'id': '6742501081818877190',
621 'ext': 'mp4',
622 'title': 'md5:5e2a23877420bb85ce6521dbee39ba94',
623 'description': 'md5:5e2a23877420bb85ce6521dbee39ba94',
624 'duration': 27,
625 'height': 1024,
626 'width': 576,
627 'uploader': 'patrox',
628 'uploader_id': '18702747',
629 'uploader_url': 'https://www.tiktok.com/@patrox',
630 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws',
631 'channel_id': 'MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws',
632 'channel': 'patroX',
633 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
634 'upload_date': '20190930',
635 'timestamp': 1569860870,
636 'view_count': int,
637 'like_count': int,
638 'repost_count': int,
639 'comment_count': int,
640 'artists': ['Evan Todd', 'Jessica Keenan Wynn', 'Alice Lee', 'Barrett Wilbert Weed', 'Jon Eidson'],
641 'track': 'Big Fun',
642 },
643 }, {
644 # Banned audio, was available on the app, now works with web too
645 'url': 'https://www.tiktok.com/@barudakhb_/video/6984138651336838402',
646 'info_dict': {
647 'id': '6984138651336838402',
648 'ext': 'mp4',
649 'title': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥',
650 'description': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥',
651 'uploader': 'barudakhb_',
652 'channel': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6',
653 'uploader_id': '6974687867511718913',
654 'uploader_url': 'https://www.tiktok.com/@barudakhb_',
655 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d',
656 'channel_id': 'MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d',
657 'track': 'Boka Dance',
658 'artists': ['md5:29f238c49bc0c176cb3cef1a9cea9fa6'],
659 'timestamp': 1626121503,
660 'duration': 18,
661 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
662 'upload_date': '20210712',
663 'view_count': int,
664 'like_count': int,
665 'repost_count': int,
666 'comment_count': int,
667 },
668 }, {
669 # Sponsored video, only available with feed workaround
670 'url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_/video/7042692929109986561',
671 'info_dict': {
672 'id': '7042692929109986561',
673 'ext': 'mp4',
674 'title': 'Slap and Run!',
675 'description': 'Slap and Run!',
676 'uploader': 'user440922249',
677 'channel': 'Slap And Run',
678 'uploader_id': '7036055384943690754',
679 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_',
680 'channel_id': 'MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_',
681 'track': 'Promoted Music',
682 'timestamp': 1639754738,
683 'duration': 30,
684 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
685 'upload_date': '20211217',
686 'view_count': int,
687 'like_count': int,
688 'repost_count': int,
689 'comment_count': int,
690 },
691 'skip': 'This video is unavailable',
692 }, {
693 # Video without title and description
694 'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694',
695 'info_dict': {
696 'id': '7059698374567611694',
697 'ext': 'mp4',
698 'title': 'TikTok video #7059698374567611694',
699 'description': '',
700 'uploader': 'pokemonlife22',
701 'channel': 'Pokemon',
702 'uploader_id': '6820838815978423302',
703 'uploader_url': 'https://www.tiktok.com/@pokemonlife22',
704 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
705 'channel_id': 'MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
706 'track': 'original sound',
707 'timestamp': 1643714123,
708 'duration': 6,
709 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
710 'upload_date': '20220201',
711 'artists': ['Pokemon'],
712 'view_count': int,
713 'like_count': int,
714 'repost_count': int,
715 'comment_count': int,
716 },
717 }, {
718 # hydration JSON is sent in a <script> element
719 'url': 'https://www.tiktok.com/@denidil6/video/7065799023130643713',
720 'info_dict': {
721 'id': '7065799023130643713',
722 'ext': 'mp4',
723 'title': '#denidil#денидил',
724 'description': '#denidil#денидил',
725 'uploader': 'denidil6',
726 'uploader_id': '7046664115636405250',
727 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAsvMSzFdQ4ikl3uR2TEJwMBbB2yZh2Zxwhx-WCo3rbDpAharE3GQCrFuJArI3C8QJ',
728 'artist': 'Holocron Music',
729 'album': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night',
730 'track': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night',
731 'timestamp': 1645134536,
732 'duration': 26,
733 'upload_date': '20220217',
734 'view_count': int,
735 'like_count': int,
736 'repost_count': int,
737 'comment_count': int,
738 },
739 'skip': 'This video is unavailable',
740 }, {
741 # slideshow audio-only mp3 format
742 'url': 'https://www.tiktok.com/@_le_cannibale_/video/7139980461132074283',
743 'info_dict': {
744 'id': '7139980461132074283',
745 'ext': 'mp3',
746 'title': 'TikTok video #7139980461132074283',
747 'description': '',
748 'channel': 'Antaura',
749 'uploader': '_le_cannibale_',
750 'uploader_id': '6604511138619654149',
751 'uploader_url': 'https://www.tiktok.com/@_le_cannibale_',
752 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP',
753 'channel_id': 'MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP',
754 'artists': ['nathan !'],
755 'track': 'grahamscott canon',
756 'duration': 10,
757 'upload_date': '20220905',
758 'timestamp': 1662406249,
759 'view_count': int,
760 'like_count': int,
761 'repost_count': int,
762 'comment_count': int,
763 'thumbnail': r're:^https://.+\.(?:webp|jpe?g)',
764 },
765 }, {
766 # only available via web
767 'url': 'https://www.tiktok.com/@moxypatch/video/7206382937372134662',
768 'md5': '4cdefa501ac8ac20bf04986e10916fea',
769 'info_dict': {
770 'id': '7206382937372134662',
771 'ext': 'mp4',
772 'title': 'md5:1d95c0b96560ca0e8a231af4172b2c0a',
773 'description': 'md5:1d95c0b96560ca0e8a231af4172b2c0a',
774 'channel': 'MoxyPatch',
775 'uploader': 'moxypatch',
776 'uploader_id': '7039142049363379205',
777 'uploader_url': 'https://www.tiktok.com/@moxypatch',
778 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V',
779 'channel_id': 'MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V',
780 'artists': ['your worst nightmare'],
781 'track': 'original sound',
782 'upload_date': '20230303',
783 'timestamp': 1677866781,
784 'duration': 10,
785 'view_count': int,
786 'like_count': int,
787 'repost_count': int,
788 'comment_count': int,
789 'thumbnail': r're:^https://.+',
790 'thumbnails': 'count:3',
791 },
792 'expected_warnings': ['Unable to find video in feed'],
793 }, {
794 # 1080p format
795 'url': 'https://www.tiktok.com/@tatemcrae/video/7107337212743830830', # FIXME: Web can only get audio
796 'md5': '982512017a8a917124d5a08c8ae79621',
797 'info_dict': {
798 'id': '7107337212743830830',
799 'ext': 'mp4',
800 'title': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! @musicontiktok',
801 'description': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! @musicontiktok',
802 'uploader': 'tatemcrae',
803 'uploader_id': '86328792343818240',
804 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd',
805 'channel_id': 'MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd',
806 'channel': 'tate mcrae',
807 'artists': ['tate mcrae'],
808 'track': 'original sound',
809 'upload_date': '20220609',
810 'timestamp': 1654805899,
811 'duration': 150,
812 'view_count': int,
813 'like_count': int,
814 'repost_count': int,
815 'comment_count': int,
816 'thumbnail': r're:^https://.+\.webp',
817 },
818 'skip': 'Unavailable via feed API, only audio available via web',
819 }, {
820 # Slideshow, audio-only m4a format
821 'url': 'https://www.tiktok.com/@hara_yoimiya/video/7253412088251534594',
822 'md5': '2ff8fe0174db2dbf49c597a7bef4e47d',
823 'info_dict': {
824 'id': '7253412088251534594',
825 'ext': 'm4a',
826 'title': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ',
827 'description': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ',
828 'uploader': 'hara_yoimiya',
829 'uploader_id': '6582536342634676230',
830 'uploader_url': 'https://www.tiktok.com/@hara_yoimiya',
831 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB',
832 'channel_id': 'MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB',
833 'channel': 'лампочка(!)',
834 'artists': ['Øneheart'],
835 'album': 'watching the stars',
836 'track': 'watching the stars',
837 'duration': 60,
838 'upload_date': '20230708',
839 'timestamp': 1688816612,
840 'view_count': int,
841 'like_count': int,
842 'comment_count': int,
843 'repost_count': int,
844 'thumbnail': r're:^https://.+\.(?:webp|jpe?g)',
845 },
846 }, {
847 # Auto-captions available
848 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758',
849 'only_matching': True,
850 }]
851
def _real_extract(self, url):
    # Extract a single video: try the app API first when app info is configured,
    # otherwise (or on failure) fall back to parsing the video webpage.
    mobj = self._match_valid_url(url)
    video_id, user_id = mobj.group('id'), mobj.group('user_id')

    if self._KNOWN_APP_INFO:
        try:
            return self._extract_aweme_app(video_id)
        except ExtractorError as app_error:
            # Downgrade the app API failure to a warning and continue with the webpage
            app_error.expected = True
            self.report_warning(f'{app_error}; trying with webpage')

    webpage_url = self._create_url(user_id, video_id)
    video_data, status = self._extract_web_data_and_status(webpage_url, video_id)

    if status == 10216:
        raise ExtractorError('This video is private', expected=True)
    if not video_data or status != 0:
        raise ExtractorError(f'Video not available, status code {status}', video_id=video_id)
    return self._parse_aweme_video_web(video_data, webpage_url, video_id)
870
871
class TikTokUserIE(TikTokBaseIE):
    # Playlist extractor for the videos posted by one user. The input is either a
    # profile URL (`@name`) or the `tiktokuser:<secUid>` form; entries are fetched
    # page by page from the web `creator/item_list` API.
    IE_NAME = 'tiktok:user'
    _VALID_URL = r'(?:tiktokuser:|https?://(?:www\.)?tiktok\.com/@)(?P<id>[\w.-]+)/?(?:$|[#?])'
    _TESTS = [{
        'url': 'https://tiktok.com/@corgibobaa?lang=en',
        'playlist_mincount': 45,
        'info_dict': {
            'id': 'MS4wLjABAAAAepiJKgwWhulvCpSuUVsp7sgVVsFJbbNaLeQ6OQ0oAJERGDUIXhb2yxxHZedsItgT',
            'title': 'corgibobaa',
        },
    }, {
        'url': 'https://www.tiktok.com/@6820838815978423302',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
            'title': '6820838815978423302',
        },
    }, {
        'url': 'https://www.tiktok.com/@meme',
        'playlist_mincount': 593,
        'info_dict': {
            'id': 'MS4wLjABAAAAiKfaDWeCsT3IHwY77zqWGtVRIy9v4ws1HbVi7auP1Vx7dJysU_hc5yRiGywojRD6',
            'title': 'meme',
        },
    }, {
        'url': 'tiktokuser:MS4wLjABAAAAM3R2BtjzVT-uAtstkl2iugMzC6AtnpkojJbjiOdDDrdsTiTR75-8lyWJCY5VvDrZ',
        'playlist_mincount': 31,
        'info_dict': {
            'id': 'MS4wLjABAAAAM3R2BtjzVT-uAtstkl2iugMzC6AtnpkojJbjiOdDDrdsTiTR75-8lyWJCY5VvDrZ',
        },
    }]
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0'
    _API_BASE_URL = 'https://www.tiktok.com/api/creator/item_list/'

    def _build_web_query(self, sec_uid, cursor):
        # Query string for one page of the item list. Everything is a fixed
        # browser-like value except the cursor, device ID, secUid and a random verifyFp.
        return {
            'aid': '1988',
            'app_language': 'en',
            'app_name': 'tiktok_web',
            'browser_language': 'en-US',
            'browser_name': 'Mozilla',
            'browser_online': 'true',
            'browser_platform': 'Win32',
            'browser_version': '5.0 (Windows)',
            'channel': 'tiktok_web',
            'cookie_enabled': 'true',
            'count': '15',
            'cursor': cursor,
            'device_id': self._DEVICE_ID,
            'device_platform': 'web_pc',
            'focus_state': 'true',
            'from_page': 'user',
            'history_len': '2',
            'is_fullscreen': 'false',
            'is_page_visible': 'true',
            'language': 'en',
            'os': 'windows',
            'priority_region': '',
            'referer': '',
            'region': 'US',
            'screen_height': '1080',
            'screen_width': '1920',
            'secUid': sec_uid,
            'type': '1',  # pagination type: 0 == oldest-to-newest, 1 == newest-to-oldest
            'tz_name': 'UTC',
            'verifyFp': 'verify_' + ''.join(random.choices(string.hexdigits, k=7)),
            'webcast_language': 'en',
        }

    def _entries(self, sec_uid, user_name):
        # Generator of flat url_results, one per distinct video ID, starting from
        # "now" (millisecond cursor) and moving towards older posts.
        display_id = user_name or sec_uid
        yielded_ids = set()
        one_week_ms = 7 * 86_400_000

        cursor = int(time.time() * 1E3)
        for page in itertools.count(1):
            response = self._download_json(
                self._API_BASE_URL, display_id, f'Downloading page {page}',
                query=self._build_web_query(sec_uid, cursor), headers={'User-Agent': self._USER_AGENT})

            for video in traverse_obj(response, ('itemList', lambda _, v: v['id'])):
                video_id = video['id']
                if video_id not in yielded_ids:
                    yielded_ids.add(video_id)
                    webpage_url = self._create_url(display_id, video_id)
                    flat_info = self._parse_aweme_video_web(
                        video, webpage_url, video_id, extract_flat=True)
                    yield self.url_result(webpage_url, TikTokIE, **flat_info)

            # The next cursor is the creation time (in ms) of the last item on this page
            last_created = traverse_obj(
                response, ('itemList', -1, 'createTime', {lambda x: int(x * 1E3)}))
            if last_created and last_created != cursor:
                cursor = last_created
            else:
                # User may not have posted within this ~1 week lookback, so manually adjust cursor
                cursor -= one_week_ms

            # In case 'hasMorePrevious' is wrong, break if we have gone back before TikTok existed
            if cursor < 1472706000000:
                break
            if not traverse_obj(response, 'hasMorePrevious'):
                break

    def _get_sec_uid(self, user_url, user_name, msg):
        # Download `user_url` (non-fatally) and look for the secUid, first in the
        # universal data and only then in the SIGI state.
        webpage = self._download_webpage(
            user_url, user_name, fatal=False, headers={'User-Agent': 'Mozilla/5.0'},
            note=f'Downloading {msg} webpage', errnote=f'Unable to download {msg} webpage') or ''

        from_universal_data = traverse_obj(
            self._get_universal_data(webpage, user_name),
            ('webapp.user-detail', 'userInfo', 'user', 'secUid', {str}))
        if from_universal_data:
            return from_universal_data

        return traverse_obj(
            self._get_sigi_state(webpage, user_name),
            ('LiveRoom', 'liveRoomUserInfo', 'user', 'secUid', {str}),
            ('UserModule', 'users', ..., 'secUid', {str}, any))

    def _real_extract(self, url):
        # Resolve the user's secUid (directly from the input, from the profile/live
        # pages, or from a video listed on the embed page) and build the playlist.
        user_name = self._match_id(url)
        sec_uid = None

        if re.fullmatch(r'MS4wLjABAAAA[\w-]{64}', user_name):
            # The input already is a secUid, so there is no user name to show
            sec_uid, user_name = user_name, None
        else:
            for suffix, page_kind in (('', 'user'), ('/live', 'live')):
                sec_uid = self._get_sec_uid(
                    self._UPLOADER_URL_FORMAT % f'{user_name}{suffix}', user_name, page_kind)
                if sec_uid:
                    break

        if not sec_uid:
            embed_page = self._download_webpage(
                f'https://www.tiktok.com/embed/@{user_name}', user_name,
                note='Downloading user embed page', fatal=False) or ''
            embed_state = self._search_json(
                r'<script[^>]+\bid=[\'"]__FRONTITY_CONNECT_STATE__[\'"][^>]*>',
                embed_page, 'data', user_name, default={})
            data = traverse_obj(
                embed_state, ('source', 'data', f'/embed/@{user_name}', {dict}))

            # Use the first listed video whose metadata exposes a channel_id
            for aweme_id in traverse_obj(data, ('videoList', ..., 'id', {str})):
                webpage_url = self._create_url(user_name, aweme_id)
                video_data, _ = self._extract_web_data_and_status(webpage_url, aweme_id, fatal=False)
                flat_info = self._parse_aweme_video_web(
                    video_data, webpage_url, aweme_id, extract_flat=True)
                sec_uid = flat_info.get('channel_id')
                if sec_uid:
                    break

        if not sec_uid:
            raise ExtractorError(
                'Unable to extract secondary user ID. If you are able to get the channel_id '
                'from a video posted by this user, try using "tiktokuser:channel_id" as the '
                'input URL (replacing `channel_id` with its actual value)', expected=True)

        return self.playlist_result(self._entries(sec_uid, user_name), sec_uid, user_name)
1013
1014
class TikTokBaseListIE(TikTokBaseIE):  # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor
    # Shared paging logic for list-type extractors. Subclasses must define
    # `_QUERY_NAME` (the query key that carries the list ID) and `_API_ENDPOINT`
    # (the endpoint passed to `self._call_api`).

    def _entries(self, list_id, display_id):
        # Yield one info dict per video of the list, following the API cursor
        # until the response no longer reports `has_more`.
        query = {
            self._QUERY_NAME: list_id,
            'cursor': 0,
            'count': 20,
            'type': 5,
            'device_id': self._DEVICE_ID,
        }

        for page in itertools.count(1):
            for retry in self.RetryManager():
                try:
                    post_list = self._call_api(
                        self._API_ENDPOINT, query, display_id, note=f'Downloading video list page {page}',
                        errnote='Unable to download video list')
                except ExtractorError as e:
                    # Only a JSON decode failure at position 0 is retried; anything else is re-raised
                    if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0:
                        retry.error = e
                        continue
                    raise

            # `.get(key, [])` still returns None when the key is present but null,
            # which would make the loop raise TypeError; fall back to an empty list instead
            for video in post_list.get('aweme_list') or []:
                yield {
                    **self._parse_aweme_video_app(video),
                    'extractor_key': TikTokIE.ie_key(),
                    'extractor': 'TikTok',
                    'webpage_url': f'https://tiktok.com/@_/video/{video["aweme_id"]}',
                }

            if not post_list.get('has_more'):
                break

            # Stop instead of crashing with KeyError when no cursor is returned, and
            # instead of requesting the same page forever when the cursor does not advance
            next_cursor = post_list.get('cursor')
            if next_cursor is None or next_cursor == query['cursor']:
                break
            query['cursor'] = next_cursor

    def _real_extract(self, url):
        # The list ID doubles as display ID and playlist ID
        list_id = self._match_id(url)
        return self.playlist_result(self._entries(list_id, list_id), list_id)
1050
1051
class TikTokSoundIE(TikTokBaseListIE):
    # List extractor for /music/<slug>-<id> pages; currently flagged as not working
    IE_NAME = 'tiktok:sound'
    _WORKING = False

    # Consumed by the paging logic inherited from TikTokBaseListIE
    _QUERY_NAME = 'music_id'
    _API_ENDPOINT = 'music/aweme'

    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/music/[\w\.-]+-(?P<id>[\d]+)[/?#&]?'
    _TESTS = [{
        'url': 'https://www.tiktok.com/music/Build-a-Btch-6956990112127585029?lang=en',
        'playlist_mincount': 100,
        'info_dict': {
            'id': '6956990112127585029',
        },
        'expected_warnings': ['Retrying'],
    }, {
        # Actual entries are less than listed video count
        'url': 'https://www.tiktok.com/music/jiefei-soap-remix-7036843036118469381',
        'playlist_mincount': 2182,
        'info_dict': {
            'id': '7036843036118469381',
        },
        'expected_warnings': ['Retrying'],
    }]
1074
1075
class TikTokEffectIE(TikTokBaseListIE):
    # List extractor for /sticker/<slug>-<id> pages; currently flagged as not working
    IE_NAME = 'tiktok:effect'
    _WORKING = False

    # Consumed by the paging logic inherited from TikTokBaseListIE
    _QUERY_NAME = 'sticker_id'
    _API_ENDPOINT = 'sticker/aweme'

    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/sticker/[\w\.-]+-(?P<id>[\d]+)[/?#&]?'
    _TESTS = [{
        'url': 'https://www.tiktok.com/sticker/MATERIAL-GWOOORL-1258156',
        'playlist_mincount': 100,
        'info_dict': {
            'id': '1258156',
        },
        'expected_warnings': ['Retrying'],
    }, {
        # Different entries between mobile and web, depending on region
        'url': 'https://www.tiktok.com/sticker/Elf-Friend-479565',
        'only_matching': True,
    }]
1094
1095
class TikTokTagIE(TikTokBaseListIE):
    # List extractor for /tag/<name> pages; currently flagged as not working
    IE_NAME = 'tiktok:tag'
    _WORKING = False

    # Consumed by the paging logic inherited from TikTokBaseListIE
    _QUERY_NAME = 'ch_id'
    _API_ENDPOINT = 'challenge/aweme'

    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/tag/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://tiktok.com/tag/hello2018',
        'playlist_mincount': 39,
        'info_dict': {
            'id': '46294678',
            'title': 'hello2018',
        },
        'expected_warnings': ['Retrying'],
    }, {
        'url': 'https://tiktok.com/tag/fypシ?is_copy_url=0&is_from_webapp=v1',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # The URL only carries the tag name; the numeric tag ID needed by the API
        # is scraped from a `snssdk…://challenge/detail/<id>` link in the webpage.
        display_id = self._match_id(url)
        crawler_headers = {
            'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)',
        }
        webpage = self._download_webpage(url, display_id, headers=crawler_headers)
        tag_id = self._html_search_regex(r'snssdk\d*://challenge/detail/(\d+)', webpage, 'tag ID')
        entries = self._entries(tag_id, display_id)
        return self.playlist_result(entries, tag_id, display_id)
1122
1123
class TikTokCollectionIE(TikTokBaseIE):
    # Playlist extractor for a user's collection, paged through the web
    # `collection/item_list` API with an offset cursor.
    IE_NAME = 'tiktok:collection'
    _VALID_URL = r'https?://www\.tiktok\.com/@(?P<user_id>[\w.-]+)/collection/(?P<title>[^/?#]+)-(?P<id>\d+)/?(?:[?#]|$)'
    _TESTS = [{
        # playlist should have exactly 9 videos
        'url': 'https://www.tiktok.com/@imanoreotwe/collection/count-test-7371330159376370462',
        'info_dict': {
            'id': '7371330159376370462',
            'title': 'imanoreotwe-count-test',
        },
        'playlist_count': 9,
    }, {
        # tests returning multiple pages of a large collection
        'url': 'https://www.tiktok.com/@imanoreotwe/collection/%F0%9F%98%82-7111887189571160875',
        'info_dict': {
            'id': '7111887189571160875',
            'title': 'imanoreotwe-%F0%9F%98%82',
        },
        'playlist_mincount': 100,
    }]
    _API_BASE_URL = 'https://www.tiktok.com/api/collection/item_list/'
    _PAGE_COUNT = 30

    def _build_web_query(self, collection_id, cursor):
        # Query string for the page of `_PAGE_COUNT` items starting at offset `cursor`
        return {
            'aid': '1988',
            'collectionId': collection_id,
            'count': self._PAGE_COUNT,
            'cursor': cursor,
            'sourceType': '113',
        }

    def _entries(self, collection_id):
        # Yield a flat url_result for every item that has an ID, page after page,
        # until the response stops reporting `hasMore`.
        for page in itertools.count(1):
            # The cursor is an item offset that grows by one page size per request
            offset = (page - 1) * self._PAGE_COUNT
            response = self._download_json(
                self._API_BASE_URL, collection_id, f'Downloading page {page}',
                query=self._build_web_query(collection_id, offset))

            for video in traverse_obj(response, ('itemList', lambda _, v: v['id'])):
                video_id = video['id']
                # First available author identifier, or '_' as a placeholder
                author = traverse_obj(video, ('author', ('uniqueId', 'secUid', 'id'), {str}, any)) or '_'
                webpage_url = self._create_url(author, video_id)
                flat_info = self._parse_aweme_video_web(
                    video, webpage_url, video_id, extract_flat=True)
                yield self.url_result(webpage_url, TikTokIE, **flat_info)

            if not traverse_obj(response, 'hasMore'):
                break

    def _real_extract(self, url):
        # The playlist title is "<user>-<collection title>", both taken verbatim from the URL
        mobj = self._match_valid_url(url)
        collection_id = mobj.group('id')
        playlist_title = f'{mobj.group("user_id")}-{mobj.group("title")}'

        return self.playlist_result(
            self._entries(collection_id), collection_id, playlist_title)
1180
1181
class DouyinIE(TikTokBaseIE):
    # Single-video extractor for douyin.com, backed by the web `aweme/detail` endpoint
    _VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'https://www.douyin.com/video/6961737553342991651',
        'md5': '9ecce7bc5b302601018ecb2871c63a75',
        'info_dict': {
            'id': '6961737553342991651',
            'ext': 'mp4',
            'title': '#杨超越 小小水手带你去远航❤️',
            'description': '#杨超越 小小水手带你去远航❤️',
            'uploader': '6897520xka',
            'uploader_id': '110403406559',
            'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
            'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
            'channel': '杨超越',
            'duration': 19,
            'timestamp': 1620905839,
            'upload_date': '20210513',
            'track': '@杨超越创作的原声',
            'artists': ['杨超越'],
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
            'thumbnail': r're:https?://.+\.jpe?g',
        },
    }, {
        'url': 'https://www.douyin.com/video/6982497745948921092',
        'md5': '15c5e660b7048af3707304e3cc02bbb5',
        'info_dict': {
            'id': '6982497745948921092',
            'ext': 'mp4',
            'title': '这个夏日和小羊@杨超越 一起遇见白色幻想',
            'description': '这个夏日和小羊@杨超越 一起遇见白色幻想',
            'uploader': '0731chaoyue',
            'uploader_id': '408654318141572',
            'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
            'channel_id': 'MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
            'channel': '杨超越工作室',
            'duration': 42,
            'timestamp': 1625739481,
            'upload_date': '20210708',
            'track': '@杨超越工作室创作的原声',
            'artists': ['杨超越工作室'],
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
            'thumbnail': r're:https?://.+\.jpe?g',
        },
    }, {
        'url': 'https://www.douyin.com/video/6953975910773099811',
        'md5': '0e6443758b8355db9a3c34864a4276be',
        'info_dict': {
            'id': '6953975910773099811',
            'ext': 'mp4',
            'title': '#一起看海 出现在你的夏日里',
            'description': '#一起看海 出现在你的夏日里',
            'uploader': '6897520xka',
            'uploader_id': '110403406559',
            'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
            'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
            'channel': '杨超越',
            'duration': 17,
            'timestamp': 1619098692,
            'upload_date': '20210422',
            'track': '@杨超越创作的原声',
            'artists': ['杨超越'],
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
            'thumbnail': r're:https?://.+\.jpe?g',
        },
    }, {
        'url': 'https://www.douyin.com/video/6950251282489675042',
        'md5': 'b4db86aec367ef810ddd38b1737d2fed',
        'info_dict': {
            'id': '6950251282489675042',
            'ext': 'mp4',
            'title': '哈哈哈,成功了哈哈哈哈哈哈',
            'uploader': '杨超越',
            'upload_date': '20210412',
            'timestamp': 1618231483,
            'uploader_id': '110403406559',
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
        },
        'skip': 'No longer available',
    }, {
        'url': 'https://www.douyin.com/video/6963263655114722595',
        'md5': '1440bcf59d8700f8e014da073a4dfea8',
        'info_dict': {
            'id': '6963263655114722595',
            'ext': 'mp4',
            'title': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
            'description': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
            'uploader': '6897520xka',
            'uploader_id': '110403406559',
            'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
            'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
            'channel': '杨超越',
            'duration': 15,
            'timestamp': 1621261163,
            'upload_date': '20210517',
            'track': '@杨超越创作的原声',
            'artists': ['杨超越'],
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
            'thumbnail': r're:https?://.+\.jpe?g',
        },
    }]
    _UPLOADER_URL_FORMAT = 'https://www.douyin.com/user/%s'
    _WEBPAGE_HOST = 'https://www.douyin.com/'

    def _real_extract(self, url):
        # Fetch the detail JSON non-fatally; when it carries no `aweme_detail` dict,
        # report that fresh cookies are required.
        video_id = self._match_id(url)

        response = self._download_json(
            'https://www.douyin.com/aweme/v1/web/aweme/detail/', video_id,
            'Downloading web detail JSON', 'Failed to download web detail JSON',
            query={'aweme_id': video_id}, fatal=False)
        detail = traverse_obj(response, ('aweme_detail', {dict}))
        if detail:
            return self._parse_aweme_video_app(detail)

        # TODO: Run verification challenge code to generate signature cookies
        # The error is only flagged as "expected" when no `s_v_web_id` cookie is set
        has_verify_cookie = self._get_cookies(self._WEBPAGE_HOST).get('s_v_web_id')
        raise ExtractorError(
            'Fresh cookies (not necessarily logged in) are needed',
            expected=not has_verify_cookie)
1315
1316
class TikTokVMIE(InfoExtractor):
    # Resolver for short share links (vm.tiktok.com, vt.tiktok.com, tiktok.com/t/…):
    # follows the redirect and hands the final URL to whichever extractor matches it.
    #
    # The `www.` prefix is optional, in line with the `(?:www\.)?` used by the
    # list extractors in this file; the previous `(?:www\.)` group lacked its `?`
    # quantifier and so rejected bare `tiktok.com/t/<id>` links.
    _VALID_URL = r'https?://(?:(?:vm|vt)\.tiktok\.com|(?:www\.)?tiktok\.com/t)/(?P<id>\w+)'
    IE_NAME = 'vm.tiktok'

    _TESTS = [{
        'url': 'https://www.tiktok.com/t/ZTRC5xgJp',
        'info_dict': {
            'id': '7170520270497680683',
            'ext': 'mp4',
            'title': 'md5:c64f6152330c2efe98093ccc8597871c',
            'uploader_id': '6687535061741700102',
            'upload_date': '20221127',
            'view_count': int,
            'like_count': int,
            'comment_count': int,
            'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAObqu3WCTXxmw2xwZ3iLEHnEecEIw7ks6rxWqOqOhaPja9BI7gqUQnjw8_5FSoDXX',
            'album': 'Wave of Mutilation: Best of Pixies',
            'thumbnail': r're:https://.+\.webp.*',
            'duration': 5,
            'timestamp': 1669516858,
            'repost_count': int,
            'artist': 'Pixies',
            'track': 'Where Is My Mind?',
            'description': 'md5:c64f6152330c2efe98093ccc8597871c',
            'uploader': 'sigmachaddeus',
            'creator': 'SigmaChad',
        },
    }, {
        'url': 'https://vm.tiktok.com/ZTR45GpSF/',
        'info_dict': {
            'id': '7106798200794926362',
            'ext': 'mp4',
            'title': 'md5:edc3e7ea587847f8537468f2fe51d074',
            'uploader_id': '6997695878846268418',
            'upload_date': '20220608',
            'view_count': int,
            'like_count': int,
            'comment_count': int,
            'thumbnail': r're:https://.+\.webp.*',
            'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAdZ_NcPPgMneaGrW0hN8O_J_bwLshwNNERRF5DxOw2HKIzk0kdlLrR8RkVl1ksrMO',
            'duration': 29,
            'timestamp': 1654680400,
            'repost_count': int,
            'artist': 'Akihitoko',
            'track': 'original sound',
            'description': 'md5:edc3e7ea587847f8537468f2fe51d074',
            'uploader': 'akihitoko1',
            'creator': 'Akihitoko',
        },
    }, {
        'url': 'https://vt.tiktok.com/ZSe4FqkKd',
        'only_matching': True,
    }, {
        'url': 'https://tiktok.com/t/ZTRC5xgJp',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Issue a HEAD request and read the URL the response finally came from
        short_id = self._match_id(url)
        response = self._request_webpage(
            HEADRequest(url), short_id, headers={'User-Agent': 'facebookexternalhit/1.1'})
        new_url = response.url
        if self.suitable(new_url):  # Prevent infinite loop in case redirect fails
            raise UnsupportedError(new_url)
        return self.url_result(new_url)
1377
1378
class TikTokLiveIE(TikTokBaseIE):
    # Livestream extractor. Accepts a user's /live page or a mobile share link
    # that carries the room ID, and collects FLV and HLS formats for the room.
    _VALID_URL = r'''(?x)https?://(?:
                        (?:www\.)?tiktok\.com/@(?P<uploader>[\w.-]+)/live|
                        m\.tiktok\.com/share/live/(?P<id>\d+)
                    )'''
    IE_NAME = 'tiktok:live'

    _TESTS = [{
        'url': 'https://www.tiktok.com/@weathernewslive/live',
        'info_dict': {
            'id': '7210809319192726273',
            'ext': 'mp4',
            'title': r're:ウェザーニュースLiVE[\d\s:-]*',
            'creator': 'ウェザーニュースLiVE',
            'uploader': 'weathernewslive',
            'uploader_id': '6621496731283095554',
            'uploader_url': 'https://www.tiktok.com/@weathernewslive',
            'live_status': 'is_live',
            'concurrent_view_count': int,
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://www.tiktok.com/@pilarmagenta/live',
        'info_dict': {
            'id': '7209423610325322522',
            'ext': 'mp4',
            'title': str,
            'creator': 'Pilarmagenta',
            'uploader': 'pilarmagenta',
            'uploader_id': '6624846890674683909',
            'uploader_url': 'https://www.tiktok.com/@pilarmagenta',
            'live_status': 'is_live',
            'concurrent_view_count': int,
        },
        'skip': 'Livestream',
    }, {
        'url': 'https://m.tiktok.com/share/live/7209423610325322522/?language=en',
        'only_matching': True,
    }, {
        'url': 'https://www.tiktok.com/@iris04201/live',
        'only_matching': True,
    }]

    def _call_api(self, url, param, room_id, uploader, key=None):
        # Query a room-info endpoint and return the dict found under `key`, but
        # only if it reports a running stream; otherwise raise.
        payload = self._download_json(
            url, room_id, fatal=False, query={
                'aid': '1988',
                param: room_id,
            })
        response = traverse_obj(payload, (key, {dict}), default={})

        # status == 2 if live else 4
        if int_or_none(response.get('status')) == 2:
            return response
        # If room_id is obtained via mobile share URL and cannot be refreshed, do not wait for live
        if not uploader:
            raise ExtractorError('This livestream has ended', expected=True)
        raise UserNotLive(video_id=uploader)

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)
        uploader, room_id = mobj.group('uploader'), mobj.group('id')
        # The webpage is optional only when the URL itself already provided a room ID
        webpage = self._download_webpage(
            url, uploader or room_id, headers={'User-Agent': 'Mozilla/5.0'}, fatal=not room_id)

        if webpage:
            data = self._get_sigi_state(webpage, uploader or room_id)
            # Prefer the room ID from the page state, then from an app link, then from the URL
            room_id = (traverse_obj(data, ('UserModule', 'users', ..., 'roomId', {str_or_none}), get_all=False)
                       or self._search_regex(r'snssdk\d*://live\?room_id=(\d+)', webpage, 'room ID', default=None)
                       or room_id)
            uploader = uploader or traverse_obj(
                data, ('LiveRoom', 'liveRoomUserInfo', 'user', 'uniqueId'),
                ('UserModule', 'users', ..., 'uniqueId'), get_all=False, expected_type=str)

        if not room_id:
            raise UserNotLive(video_id=uploader)

        formats = []
        live_info = self._call_api(
            'https://webcast.tiktok.com/webcast/room/info', 'room_id', room_id, uploader, key='data')

        get_quality = qualities(('SD1', 'ld', 'SD2', 'sd', 'HD1', 'hd', 'FULL_HD1', 'uhd', 'ORIGION', 'origin'))
        parse_inner = lambda x: self._parse_json(x, None)

        # Per-quality streams; `stream_data` and `sdk_params` are JSON strings that need decoding
        sdk_streams = traverse_obj(live_info, (
            'stream_url', 'live_core_sdk_data', 'pull_data', 'stream_data',
            {parse_inner}, 'data', {dict}), default={})
        for quality, stream in sdk_streams.items():
            sdk_params = traverse_obj(stream, ('main', 'sdk_params', {parse_inner}, {
                'vcodec': ('VCodec', {str}),
                'tbr': ('vbitrate', {lambda x: int_or_none(x, 1000)}),
                'resolution': ('resolution', {lambda x: re.match(r'(?i)\d+x\d+|\d+p', x).group().lower()}),
            }))

            for kind, container in (
                ('flv', {'ext': 'flv'}),
                ('hls', {'ext': 'mp4', 'protocol': 'm3u8_native'}),
            ):
                sdk_url = traverse_obj(stream, ('main', kind, {url_or_none}))
                if sdk_url:
                    formats.append({
                        'url': sdk_url,
                        **container,
                        'format_id': f'{kind}-{quality}',
                        'quality': get_quality(quality),
                        **sdk_params,
                    })

        def get_vcodec(*keys):
            return traverse_obj(live_info, (
                'stream_url', *keys, {parse_inner}, 'VCodec', {str}))

        # Single "pull" URLs, ranked like the 'ORIGION' quality
        for stream, ext, protocol in (('hls', 'mp4', 'm3u8_native'), ('rtmp', 'flv', 'https')):
            stream_url = traverse_obj(live_info, ('stream_url', f'{stream}_pull_url', {url_or_none}))
            if stream_url:
                formats.append({
                    'url': stream_url,
                    'ext': ext,
                    'protocol': protocol,
                    'format_id': f'{stream}-pull',
                    'vcodec': get_vcodec(f'{stream}_pull_url_params'),
                    'quality': get_quality('ORIGION'),
                })

        # FLV pull URLs keyed by quality name; entries that are not valid URLs are ignored
        flv_pull_urls = traverse_obj(live_info, ('stream_url', 'flv_pull_url', {dict}), default={})
        for f_id, f_url in flv_pull_urls.items():
            if url_or_none(f_url):
                formats.append({
                    'url': f_url,
                    'ext': 'flv',
                    'format_id': f'flv-{f_id}'.lower(),
                    'vcodec': get_vcodec('flv_pull_url_params', f_id),
                    'quality': get_quality(f_id),
                })

        # If uploader is a guest on another's livestream, primary endpoint will not have m3u8 URLs
        if not traverse_obj(formats, lambda _, v: v['ext'] == 'mp4'):
            live_info = merge_dicts(live_info, self._call_api(
                'https://www.tiktok.com/api/live/detail/', 'roomID', room_id, uploader, key='LiveRoomInfo'))
            if url_or_none(live_info.get('liveUrl')):
                formats.append({
                    'url': live_info['liveUrl'],
                    'ext': 'mp4',
                    'protocol': 'm3u8_native',
                    'format_id': 'hls-fallback',
                    'vcodec': 'h264',
                    'quality': get_quality('origin'),
                })

        uploader = uploader or traverse_obj(live_info, ('ownerInfo', 'uniqueId'), ('owner', 'display_id'))

        # Field names differ between the two endpoints, so each key tries both shapes
        room_metadata = traverse_obj(live_info, {
            'title': 'title',
            'uploader_id': (('ownerInfo', 'owner'), 'id', {str_or_none}),
            'creator': (('ownerInfo', 'owner'), 'nickname'),
            'concurrent_view_count': (('user_count', ('liveRoomStats', 'userCount')), {int_or_none}),
        }, get_all=False)

        return {
            'id': room_id,
            'uploader': uploader,
            'uploader_url': format_field(uploader, None, self._UPLOADER_URL_FORMAT) or None,
            'is_live': True,
            'formats': formats,
            '_format_sort_fields': ('quality', 'ext'),
            **room_metadata,
        }