]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/tiktok.py
[ie/youtube] Suppress "Unavailable videos are hidden" warning (#10159)
[yt-dlp.git] / yt_dlp / extractor / tiktok.py
CommitLineData
3584b839 1import functools
f7f18f90 2import itertools
b801cd71 3import json
bd9ff55b 4import random
216bcb66 5import re
347f13dd 6import string
bd9ff55b 7import time
add96eb9 8import urllib.parse
cb61e20c 9import uuid
1ead840d
KS
10
11from .common import InfoExtractor
3d2623a8 12from ..networking import HEADRequest
1ead840d 13from ..utils import (
ce18a19b 14 ExtractorError,
11e1c2e3 15 UnsupportedError,
933ed882 16 UserNotLive,
8ceb07e8 17 determine_ext,
3584b839 18 filter_dict,
216bcb66 19 format_field,
1ead840d 20 int_or_none,
34921b43 21 join_nonempty,
216bcb66 22 merge_dicts,
4ccd73fe 23 mimetype2ext,
24 parse_qs,
b801cd71 25 qualities,
ba723997 26 remove_start,
e0585e65 27 srt_subtitles_timecode,
1ead840d 28 str_or_none,
bd9ff55b 29 traverse_obj,
216bcb66 30 try_call,
bd9ff55b 31 try_get,
943d5ab1 32 url_or_none,
96472d72 33 urlencode_postdata,
1ead840d
KS
34)
35
36
class TikTokBaseIE(InfoExtractor):
    """Shared helpers for TikTok extractors.

    Provides access to the mobile-app API (with rotation through a pool of
    "app info" candidates) and parsers that turn app-API or webpage video
    metadata into info dicts.
    """

    _UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s'
    _WEBPAGE_HOST = 'https://www.tiktok.com/'
    QUALITIES = ('360p', '540p', '720p', '1080p')

    _APP_INFO_DEFAULTS = {
        # unique "install id"
        'iid': None,
        # TikTok (KR/PH/TW/TH/VN) = trill, TikTok (rest of world) = musical_ly, Douyin = aweme
        'app_name': 'musical_ly',
        'app_version': '35.1.3',
        'manifest_app_version': '2023501030',
        # "app id": aweme = 1128, trill = 1180, musical_ly = 1233, universal = 0
        'aid': '0',
    }
    # Remaining app-info candidates; None until first built by _get_next_app_info()
    _APP_INFO_POOL = None
    # App info currently used for API requests
    _APP_INFO = None
    # User-Agent header derived from the current app info
    _APP_USER_AGENT = None

    @functools.cached_property
    def _KNOWN_APP_INFO(self):
        """User-configured `app_info` strings (extractor argument), or a default list."""
        # If we have a genuine device ID, we may not need any IID
        default = [''] if self._KNOWN_DEVICE_ID else []
        return self._configuration_arg('app_info', default, ie_key=TikTokIE)

    @functools.cached_property
    def _KNOWN_DEVICE_ID(self):
        """User-configured `device_id` extractor argument, or None if not given."""
        return self._configuration_arg('device_id', [None], ie_key=TikTokIE)[0]

    @functools.cached_property
    def _DEVICE_ID(self):
        """Configured device ID, falling back to a random numeric string (cached per instance)."""
        return self._KNOWN_DEVICE_ID or str(random.randint(7250000000000000000, 7351147085025500000))

    @functools.cached_property
    def _API_HOSTNAME(self):
        """API hostname, overridable through the `api_hostname` extractor argument."""
        return self._configuration_arg(
            'api_hostname', ['api16-normal-c-useast1a.tiktokv.com'], ie_key=TikTokIE)[0]

    def _get_next_app_info(self):
        """Switch to the next app-info candidate.

        Builds the candidate pool on first use, then pops its first entry into
        `_APP_INFO` and refreshes `_APP_USER_AGENT`.

        Returns True if a candidate was activated, False if the pool is empty.
        """
        if self._APP_INFO_POOL is None:
            defaults = {
                key: self._configuration_arg(key, [default], ie_key=TikTokIE)[0]
                for key, default in self._APP_INFO_DEFAULTS.items()
                if key != 'iid'
            }
            # Each configured string is '/'-separated, positionally matching the keys
            # of _APP_INFO_DEFAULTS; empty fields fall back to the defaults above
            self._APP_INFO_POOL = [
                {**defaults, **{
                    k: v for k, v in zip(self._APP_INFO_DEFAULTS, app_info.split('/')) if v
                }} for app_info in self._KNOWN_APP_INFO
            ]

        if not self._APP_INFO_POOL:
            return False

        self._APP_INFO = self._APP_INFO_POOL.pop(0)

        app_name = self._APP_INFO['app_name']
        version = self._APP_INFO['manifest_app_version']
        if app_name == 'musical_ly':
            package = f'com.zhiliaoapp.musically/{version}'
        else:  # trill, aweme
            package = f'com.ss.android.ugc.{app_name}/{version}'
        self._APP_USER_AGENT = f'{package} (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)'

        return True

    @staticmethod
    def _create_url(user_id, video_id):
        """Build a video webpage URL; a falsy `user_id` is replaced with '_'."""
        return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}'

    def _get_sigi_state(self, webpage, display_id):
        """Return the JSON from the SIGI_STATE / sigi-persisted-data <script>, or {} if absent."""
        return self._search_json(
            r'<script[^>]+\bid="(?:SIGI_STATE|sigi-persisted-data)"[^>]*>', webpage,
            'sigi state', display_id, end_pattern=r'</script>', default={})

    def _get_universal_data(self, webpage, display_id):
        """Return the `__DEFAULT_SCOPE__` dict of the universal rehydration JSON, or {} if absent."""
        return traverse_obj(self._search_json(
            r'<script[^>]+\bid="__UNIVERSAL_DATA_FOR_REHYDRATION__"[^>]*>', webpage,
            'universal data', display_id, end_pattern=r'</script>', default={}),
            ('__DEFAULT_SCOPE__', {dict})) or {}

    def _call_api_impl(self, ep, video_id, query=None, data=None, headers=None, fatal=True,
                       note='Downloading API JSON', errnote='Unable to download API page'):
        """Perform a single API request to `/aweme/v1/{ep}/` on the API host.

        Sets a random `odin_tt` cookie for the API host and mirrors the webpage
        host's `sid_tt` cookie (if any) onto it before downloading the JSON.
        Caller-supplied `headers` override the default User-Agent/Accept headers.
        """
        self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choices('0123456789abcdef', k=160)))
        webpage_cookies = self._get_cookies(self._WEBPAGE_HOST)
        if webpage_cookies.get('sid_tt'):
            self._set_cookie(self._API_HOSTNAME, 'sid_tt', webpage_cookies['sid_tt'].value)
        return self._download_json(
            f'https://{self._API_HOSTNAME}/aweme/v1/{ep}/', video_id=video_id,
            fatal=fatal, note=note, errnote=errnote, headers={
                'User-Agent': self._APP_USER_AGENT,
                'Accept': 'application/json',
                **(headers or {}),
            }, query=query, data=data)

    def _build_api_query(self, query):
        """Merge `query` with the device/app parameters sent with every API request.

        The fixed parameters below take precedence over same-named keys in
        `query`; the result is passed through `filter_dict`.
        """
        return filter_dict({
            **query,
            'device_platform': 'android',
            'os': 'android',
            'ssmix': 'a',
            '_rticket': int(time.time() * 1000),
            'cdid': str(uuid.uuid4()),
            'channel': 'googleplay',
            'aid': self._APP_INFO['aid'],
            'app_name': self._APP_INFO['app_name'],
            'version_code': ''.join(f'{int(v):02d}' for v in self._APP_INFO['app_version'].split('.')),
            'version_name': self._APP_INFO['app_version'],
            'manifest_version_code': self._APP_INFO['manifest_app_version'],
            'update_version_code': self._APP_INFO['manifest_app_version'],
            'ab_version': self._APP_INFO['app_version'],
            'resolution': '1080*2400',
            'dpi': 420,
            'device_type': 'Pixel 7',
            'device_brand': 'Google',
            'language': 'en',
            'os_api': '29',
            'os_version': '13',
            'ac': 'wifi',
            'is_pad': '0',
            'current_region': 'US',
            'app_type': 'normal',
            'sys_region': 'US',
            'last_install_time': int(time.time()) - random.randint(86400, 1123200),
            'timezone_name': 'America/New_York',
            'residence': 'US',
            'app_language': 'en',
            'timezone_offset': '-14400',
            'host_abi': 'armeabi-v7a',
            'locale': 'en',
            'ac2': 'wifi5g',
            'uoo': '1',
            'carrier_region': 'US',
            'op_region': 'US',
            'build_number': self._APP_INFO['app_version'],
            'region': 'US',
            'ts': int(time.time()),
            'iid': self._APP_INFO.get('iid'),
            'device_id': self._DEVICE_ID,
            'openudid': ''.join(random.choices('0123456789abcdef', k=16)),
        })

    def _call_api(self, ep, video_id, query=None, data=None, headers=None, fatal=True,
                  note='Downloading API JSON', errnote='Unable to download API page'):
        """Call the app API, rotating to the next app info when the response is not JSON.

        A JSON decode error at position 0 is treated as "this app info does not
        work" and triggers a retry with the next candidate. Any other
        ExtractorError is re-raised.

        Raises ExtractorError when `fatal` and no (working) app info is left;
        otherwise a warning is reported and None is returned.
        """
        if not self._APP_INFO and not self._get_next_app_info():
            message = 'No working app info is available'
            if fatal:
                raise ExtractorError(message, expected=True)
            else:
                self.report_warning(message)
                return

        max_tries = len(self._APP_INFO_POOL) + 1  # _APP_INFO_POOL + _APP_INFO
        for count in itertools.count(1):
            self.write_debug(str(self._APP_INFO))
            real_query = self._build_api_query(query or {})
            try:
                return self._call_api_impl(
                    ep, video_id, query=real_query, data=data, headers=headers,
                    fatal=fatal, note=note, errnote=errnote)
            except ExtractorError as e:
                if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0:
                    message = str(e.cause or e.msg)
                    if not self._get_next_app_info():
                        if fatal:
                            raise
                        else:
                            self.report_warning(message)
                            return
                    self.report_warning(f'{message}. Retrying... (attempt {count} of {max_tries})')
                    continue
                raise

    def _extract_aweme_app(self, aweme_id):
        """Fetch one aweme's details from the app API and parse them into an info dict.

        Raises ExtractorError if the response contains no detail dict.
        """
        aweme_detail = traverse_obj(
            self._call_api('multi/aweme/detail', aweme_id, data=urlencode_postdata({
                'aweme_ids': f'[{aweme_id}]',
                'request_source': '0',
            }), headers={'X-Argus': ''}), ('aweme_details', 0, {dict}))
        if not aweme_detail:
            raise ExtractorError('Unable to extract aweme detail info', video_id=aweme_id)
        return self._parse_aweme_video_app(aweme_detail)

    def _extract_web_data_and_status(self, url, video_id, fatal=True):
        """Download the video webpage and return `(video_data, status)`.

        Tries, in order, the universal rehydration data, the sigi state and the
        next.js data embedded in the page. `status` stays -1 when the page could
        not be downloaded, required login, or contained none of these.

        When `fatal`, raises on a login redirect or when no embedded data is found.
        """
        video_data, status = {}, -1

        res = self._download_webpage_handle(url, video_id, fatal=fatal, headers={'User-Agent': 'Mozilla/5.0'})
        if res is False:
            return video_data, status

        webpage, urlh = res
        if urllib.parse.urlparse(urlh.url).path == '/login':
            message = 'TikTok is requiring login for access to this content'
            if fatal:
                self.raise_login_required(message)
            self.report_warning(f'{message}. {self._login_hint()}')
            return video_data, status

        if universal_data := self._get_universal_data(webpage, video_id):
            self.write_debug('Found universal data for rehydration')
            status = traverse_obj(universal_data, ('webapp.video-detail', 'statusCode', {int})) or 0
            video_data = traverse_obj(universal_data, ('webapp.video-detail', 'itemInfo', 'itemStruct', {dict}))

        elif sigi_data := self._get_sigi_state(webpage, video_id):
            self.write_debug('Found sigi state data')
            status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0
            video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict}))

        elif next_data := self._search_nextjs_data(webpage, video_id, default={}):
            self.write_debug('Found next.js data')
            status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0
            video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict}))

        elif fatal:
            raise ExtractorError('Unable to extract webpage video data')

        return video_data, status

    def _get_subtitles(self, aweme_detail, aweme_id, user_name):
        """Collect subtitles for a video, keyed by language code.

        Sources are tried in order until one yields subtitles: captions from the
        aweme/detail endpoint (downloaded and converted to SRT), captions from
        the feed endpoint, then subtitles listed in the webpage data. When
        `user_name` is truthy the webpage is downloaded (non-fatally) for the
        last step; otherwise `aweme_detail` is used as the webpage data.
        """
        # TODO: Extract text positioning info
        subtitles = {}
        # aweme/detail endpoint subs
        captions_info = traverse_obj(
            aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 'auto_captions', ...), expected_type=dict)
        for caption in captions_info:
            caption_url = traverse_obj(caption, ('url', 'url_list', ...), expected_type=url_or_none, get_all=False)
            if not caption_url:
                continue
            caption_json = self._download_json(
                caption_url, aweme_id, note='Downloading captions', errnote='Unable to download captions', fatal=False)
            # Keep only cues that have text and both timestamps, so that a malformed
            # payload yields no cues instead of raising KeyError (same filter idiom
            # as the webpage subs lookup below)
            cues = traverse_obj(caption_json, ('utterances', lambda _, v: (
                v['text'] and v['start_time'] is not None and v['end_time'] is not None)))
            if not cues:
                continue
            subtitles.setdefault(caption.get('language') or 'en', []).append({
                'ext': 'srt',
                # Number the cues after filtering so the SRT counter has no gaps
                'data': '\n\n'.join(
                    f'{i}\n{srt_subtitles_timecode(cue["start_time"] / 1000)} --> {srt_subtitles_timecode(cue["end_time"] / 1000)}\n{cue["text"]}'
                    for i, cue in enumerate(cues, start=1)),
            })
        # feed endpoint subs
        if not subtitles:
            for caption in traverse_obj(aweme_detail, ('video', 'cla_info', 'caption_infos', ...), expected_type=dict):
                if not caption.get('url'):
                    continue
                subtitles.setdefault(caption.get('lang') or 'en', []).append({
                    'ext': remove_start(caption.get('caption_format'), 'web'),
                    'url': caption['url'],
                })
        # webpage subs
        if not subtitles:
            if user_name:  # only _parse_aweme_video_app needs to extract the webpage here
                aweme_detail, _ = self._extract_web_data_and_status(
                    self._create_url(user_name, aweme_id), aweme_id, fatal=False)
            for caption in traverse_obj(aweme_detail, ('video', 'subtitleInfos', lambda _, v: v['Url'])):
                subtitles.setdefault(caption.get('LanguageCodeName') or 'en', []).append({
                    'ext': remove_start(caption.get('Format'), 'web'),
                    'url': caption['Url'],
                })
        return subtitles

    def _parse_url_key(self, url_key):
        """Parse a format's URL key into `(format_fields, resolution_string)`.

        Returns `({}, None)` when the key does not match the expected
        `v..._<codec>_<res>p_<bitrate>` pattern.
        """
        format_id, codec, res, bitrate = self._search_regex(
            r'v[^_]+_(?P<id>(?P<codec>[^_]+)_(?P<res>\d+p)_(?P<bitrate>\d+))', url_key,
            'url key', default=(None, None, None, None), group=('id', 'codec', 'res', 'bitrate'))
        if not format_id:
            return {}, None
        return {
            'format_id': format_id,
            'vcodec': 'h265' if codec == 'bytevc1' else codec,
            'tbr': int_or_none(bitrate, scale=1000) or None,
            'quality': qualities(self.QUALITIES)(res),
        }, res

    def _parse_aweme_video_app(self, aweme_detail):
        """Build an info dict from an app-API aweme detail dict.

        Requires the `aweme_id` and `video` keys; everything else is optional.
        """
        aweme_id = aweme_detail['aweme_id']
        video_info = aweme_detail['video']
        # Width/height seen so far per resolution string, shared between formats
        known_resolutions = {}

        def audio_meta(url):
            """Return audio-only format fields for mp3 / '-music-' URLs, else {}."""
            ext = determine_ext(url, default_ext='m4a')
            return {
                'format_note': 'Music track',
                'ext': ext,
                'acodec': 'aac' if ext == 'm4a' else ext,
                'vcodec': 'none',
                'width': None,
                'height': None,
            } if ext == 'mp3' or '-music-' in url else {}

        def extract_addr(addr, add_meta=None):
            """Return one format dict per URL in `addr['url_list']`, extended with `add_meta`."""
            # Work on a private copy: a `{}` default would be shared between calls
            # and is mutated by the setdefault() below
            add_meta = dict(add_meta or {})
            parsed_meta, res = self._parse_url_key(addr.get('url_key', ''))
            is_bytevc2 = parsed_meta.get('vcodec') == 'bytevc2'
            if res:
                known_resolutions.setdefault(res, {}).setdefault('height', int_or_none(addr.get('height')))
                known_resolutions[res].setdefault('width', int_or_none(addr.get('width')))
                parsed_meta.update(known_resolutions.get(res, {}))
                add_meta.setdefault('height', int_or_none(res[:-1]))
            return [{
                'url': url,
                'filesize': int_or_none(addr.get('data_size')),
                'ext': 'mp4',
                'acodec': 'aac',
                'source_preference': -2 if 'aweme/v1' in url else -1,  # Downloads from API might get blocked
                **add_meta, **parsed_meta,
                # bytevc2 is bytedance's own custom h266/vvc codec, as-of-yet unplayable
                # Otherwise honour a caller-supplied preference (e.g. the watermark
                # penalty), which an unconditional -1 here would silently overwrite
                'preference': -100 if is_bytevc2 else add_meta.get('preference', -1),
                'format_note': join_nonempty(
                    add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None,
                    '(UNPLAYABLE)' if is_bytevc2 else None, delim=' '),
                **audio_meta(url),
            } for url in addr.get('url_list') or []]

        # Hack: Add direct video links first to prioritize them when removing duplicate formats
        formats = []
        width = int_or_none(video_info.get('width'))
        height = int_or_none(video_info.get('height'))
        ratio = try_call(lambda: width / height) or 0.5625
        if video_info.get('play_addr'):
            formats.extend(extract_addr(video_info['play_addr'], {
                'format_id': 'play_addr',
                'format_note': 'Direct video',
                'vcodec': 'h265' if traverse_obj(
                    video_info, 'is_bytevc1', 'is_h265') else 'h264',  # TODO: Check for "direct iOS" videos, like https://www.tiktok.com/@cookierun_dev/video/7039716639834656002
                'width': width,
                'height': height,
            }))
        if video_info.get('download_addr'):
            download_addr = video_info['download_addr']
            dl_width = int_or_none(download_addr.get('width'))
            formats.extend(extract_addr(download_addr, {
                'format_id': 'download_addr',
                'format_note': 'Download video%s' % (', watermarked' if video_info.get('has_watermark') else ''),
                'vcodec': 'h264',
                'width': dl_width,
                'height': try_call(lambda: int(dl_width / ratio)),  # download_addr['height'] is wrong
                'preference': -2 if video_info.get('has_watermark') else -1,
            }))
        if video_info.get('play_addr_h264'):
            formats.extend(extract_addr(video_info['play_addr_h264'], {
                'format_id': 'play_addr_h264',
                'format_note': 'Direct video',
                'vcodec': 'h264',
            }))
        if video_info.get('play_addr_bytevc1'):
            formats.extend(extract_addr(video_info['play_addr_bytevc1'], {
                'format_id': 'play_addr_bytevc1',
                'format_note': 'Direct video',
                'vcodec': 'h265',
            }))

        # `or []` also covers an explicit null value, not only a missing key
        for bitrate in video_info.get('bit_rate') or []:
            if bitrate.get('play_addr'):
                formats.extend(extract_addr(bitrate['play_addr'], {
                    'format_id': bitrate.get('gear_name'),
                    'format_note': 'Playback video',
                    'tbr': try_get(bitrate, lambda x: x['bit_rate'] / 1000),
                    'vcodec': 'h265' if traverse_obj(
                        bitrate, 'is_bytevc1', 'is_h265') else 'h264',
                    'fps': bitrate.get('FPS'),
                }))

        self._remove_duplicate_formats(formats)
        # Copy the webpage session cookie onto every format host
        auth_cookie = self._get_cookies(self._WEBPAGE_HOST).get('sid_tt')
        if auth_cookie:
            for f in formats:
                self._set_cookie(urllib.parse.urlparse(f['url']).hostname, 'sid_tt', auth_cookie.value)

        thumbnails = []
        for cover_id in ('cover', 'ai_dynamic_cover', 'animated_cover', 'ai_dynamic_cover_bak',
                         'origin_cover', 'dynamic_cover'):
            for cover_url in traverse_obj(video_info, (cover_id, 'url_list', ...)):
                thumbnails.append({
                    'id': cover_id,
                    'url': cover_url,
                })

        stats_info = aweme_detail.get('statistics') or {}
        music_info = aweme_detail.get('music') or {}
        labels = traverse_obj(aweme_detail, ('hybrid_label', ..., 'text'), expected_type=str)

        contained_music_track = traverse_obj(
            music_info, ('matched_song', 'title'), ('matched_pgc_sound', 'title'), expected_type=str)
        contained_music_author = traverse_obj(
            music_info, ('matched_song', 'author'), ('matched_pgc_sound', 'author'), 'author', expected_type=str)

        is_generic_og_trackname = music_info.get('is_original_sound') and music_info.get('title') == 'original sound - {}'.format(music_info.get('owner_handle'))
        if is_generic_og_trackname:
            music_track, music_author = contained_music_track or 'original sound', contained_music_author
        else:
            music_track, music_author = music_info.get('title'), traverse_obj(music_info, ('author', {str}))

        author_info = traverse_obj(aweme_detail, ('author', {
            'uploader': ('unique_id', {str}),
            'uploader_id': ('uid', {str_or_none}),
            'channel': ('nickname', {str}),
            'channel_id': ('sec_uid', {str}),
        }))

        return {
            'id': aweme_id,
            **traverse_obj(aweme_detail, {
                'title': ('desc', {str}),
                'description': ('desc', {str}),
                'timestamp': ('create_time', {int_or_none}),
            }),
            **traverse_obj(stats_info, {
                'view_count': 'play_count',
                'like_count': 'digg_count',
                'repost_count': 'share_count',
                'comment_count': 'comment_count',
            }, expected_type=int_or_none),
            **author_info,
            'channel_url': format_field(author_info, 'channel_id', self._UPLOADER_URL_FORMAT, default=None),
            'uploader_url': format_field(
                author_info, ['uploader', 'uploader_id'], self._UPLOADER_URL_FORMAT, default=None),
            'track': music_track,
            'album': str_or_none(music_info.get('album')) or None,
            'artists': re.split(r'(?:, | & )', music_author) if music_author else None,
            'formats': formats,
            'subtitles': self.extract_subtitles(
                aweme_detail, aweme_id, traverse_obj(author_info, 'uploader', 'uploader_id', 'channel_id')),
            'thumbnails': thumbnails,
            'duration': (traverse_obj(video_info, (
                (None, 'download_addr'), 'duration', {functools.partial(int_or_none, scale=1000)}, any))
                or traverse_obj(music_info, ('duration', {int_or_none}))),
            'availability': self._availability(
                is_private='Private' in labels,
                needs_subscription='Friends only' in labels,
                is_unlisted='Followers only' in labels),
            '_format_sort_fields': ('quality', 'codec', 'size', 'br'),
        }

    def _extract_web_formats(self, aweme_detail):
        """Build the format list from webpage video data.

        Covers the `bitrateInfo` entries, the plain `playAddr` and download
        URLs, and an audio-only fallback taken from `music.playUrl` when no
        video format was found.
        """
        COMMON_FORMAT_INFO = {
            'ext': 'mp4',
            'vcodec': 'h264',
            'acodec': 'aac',
        }
        video_info = traverse_obj(aweme_detail, ('video', {dict})) or {}
        play_width = int_or_none(video_info.get('width'))
        play_height = int_or_none(video_info.get('height'))
        ratio = try_call(lambda: play_width / play_height) or 0.5625
        formats = []

        for bitrate_info in traverse_obj(video_info, ('bitrateInfo', lambda _, v: v['PlayAddr']['UrlList'])):
            format_info, res = self._parse_url_key(
                traverse_obj(bitrate_info, ('PlayAddr', 'UrlKey', {str})) or '')
            # bytevc2 is bytedance's own custom h266/vvc codec, as-of-yet unplayable
            is_bytevc2 = format_info.get('vcodec') == 'bytevc2'
            format_info.update({
                'format_note': 'UNPLAYABLE' if is_bytevc2 else None,
                'preference': -100 if is_bytevc2 else -1,
                'filesize': traverse_obj(bitrate_info, ('PlayAddr', 'DataSize', {int_or_none})),
            })

            if dimension := (res and int(res[:-1])):
                if dimension == 540:  # '540p' is actually 576p
                    dimension = 576
                if ratio < 1:  # portrait: res/dimension is width
                    y = int(dimension / ratio)
                    format_info.update({
                        'width': dimension,
                        'height': y - (y % 2),
                    })
                else:  # landscape: res/dimension is height
                    x = int(dimension * ratio)
                    format_info.update({
                        'width': x + (x % 2),
                        'height': dimension,
                    })

            for video_url in traverse_obj(bitrate_info, ('PlayAddr', 'UrlList', ..., {url_or_none})):
                formats.append({
                    **COMMON_FORMAT_INFO,
                    **format_info,
                    'url': self._proto_relative_url(video_url),
                })

        # We don't have res string for play formats, but need quality for sorting & de-duplication
        play_quality = traverse_obj(formats, (lambda _, v: v['width'] == play_width, 'quality', any))

        for play_url in traverse_obj(video_info, ('playAddr', ((..., 'src'), None), {url_or_none})):
            formats.append({
                **COMMON_FORMAT_INFO,
                'format_id': 'play',
                'url': self._proto_relative_url(play_url),
                'width': play_width,
                'height': play_height,
                'quality': play_quality,
            })

        for download_url in traverse_obj(video_info, (('downloadAddr', ('download', 'url')), {url_or_none})):
            formats.append({
                **COMMON_FORMAT_INFO,
                'format_id': 'download',
                'url': self._proto_relative_url(download_url),
            })

        self._remove_duplicate_formats(formats)

        # Any URL not marked 'unwatermarked' is labelled as watermarked
        for f in traverse_obj(formats, lambda _, v: 'unwatermarked' not in v['url']):
            f.update({
                'format_note': join_nonempty(f.get('format_note'), 'watermarked', delim=', '),
                'preference': f.get('preference') or -2,
            })

        # Is it a slideshow with only audio for download?
        if not formats and traverse_obj(aweme_detail, ('music', 'playUrl', {url_or_none})):
            audio_url = aweme_detail['music']['playUrl']
            ext = traverse_obj(parse_qs(audio_url), (
                'mime_type', -1, {lambda x: x.replace('_', '/')}, {mimetype2ext})) or 'm4a'
            formats.append({
                'format_id': 'audio',
                'url': self._proto_relative_url(audio_url),
                'ext': ext,
                'acodec': 'aac' if ext == 'm4a' else ext,
                'vcodec': 'none',
            })

        return formats

    def _parse_aweme_video_web(self, aweme_detail, webpage_url, video_id, extract_flat=False):
        """Build an info dict from webpage video data.

        With `extract_flat`, formats and subtitles are not extracted and are
        returned as None.
        """
        author_info = traverse_obj(aweme_detail, (('authorInfo', 'author', None), {
            'channel': ('nickname', {str}),
            'channel_id': (('authorSecId', 'secUid'), {str}),
            'uploader': (('uniqueId', 'author'), {str}),
            'uploader_id': (('authorId', 'uid', 'id'), {str_or_none}),
        }), get_all=False)

        return {
            'id': video_id,
            'formats': None if extract_flat else self._extract_web_formats(aweme_detail),
            'subtitles': None if extract_flat else self.extract_subtitles(aweme_detail, video_id, None),
            'http_headers': {'Referer': webpage_url},
            **author_info,
            'channel_url': format_field(author_info, 'channel_id', self._UPLOADER_URL_FORMAT, default=None),
            'uploader_url': format_field(
                author_info, ['uploader', 'uploader_id'], self._UPLOADER_URL_FORMAT, default=None),
            **traverse_obj(aweme_detail, ('music', {
                'track': ('title', {str}),
                'album': ('album', {str}, {lambda x: x or None}),
                'artists': ('authorName', {str}, {lambda x: re.split(r'(?:, | & )', x) if x else None}),
                'duration': ('duration', {int_or_none}),
            })),
            # Merged after the music fields so that a usable video duration overrides the music duration
            **traverse_obj(aweme_detail, {
                'title': ('desc', {str}),
                'description': ('desc', {str}),
                # audio-only slideshows have a video duration of 0 and an actual audio duration
                'duration': ('video', 'duration', {int_or_none}, {lambda x: x or None}),
                'timestamp': ('createTime', {int_or_none}),
            }),
            **traverse_obj(aweme_detail, ('stats', {
                'view_count': 'playCount',
                'like_count': 'diggCount',
                'repost_count': 'shareCount',
                'comment_count': 'commentCount',
            }), expected_type=int_or_none),
            'thumbnails': traverse_obj(aweme_detail, (
                (None, 'video'), ('thumbnail', 'cover', 'dynamicCover', 'originCover'), {
                    'url': ({url_or_none}, {self._proto_relative_url}),
                },
            )),
        }
599
0fd6661e
M
600
601class TikTokIE(TikTokBaseIE):
c4cbd3be 602 _VALID_URL = r'https?://www\.tiktok\.com/(?:embed|@(?P<user_id>[\w\.-]+)?/video)/(?P<id>\d+)'
bfd973ec 603 _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
0fd6661e
M
604
605 _TESTS = [{
606 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610',
0481e266 607 'md5': '736bb7a466c6f0a6afeb597da1e6f5b7',
0fd6661e
M
608 'info_dict': {
609 'id': '6748451240264420610',
610 'ext': 'mp4',
611 'title': '#jassmanak #lehanga #leenabhushan',
612 'description': '#jassmanak #lehanga #leenabhushan',
613 'duration': 13,
0481e266 614 'height': 1024,
615 'width': 576,
0fd6661e
M
616 'uploader': 'leenabhushan',
617 'uploader_id': '6691488002098119685',
0481e266 618 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA_Eb4t1vodM1IuTy_cvp9CY22RAb59xqrO0Xtz9CYQJvgXaDvZxYnZYRzDWhhgJmy',
0fd6661e
M
619 'creator': 'facestoriesbyleenabh',
620 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
621 'upload_date': '20191016',
622 'timestamp': 1571246252,
623 'view_count': int,
624 'like_count': int,
625 'repost_count': int,
626 'comment_count': int,
a44ca5a4 627 'artist': 'Ysrbeats',
628 'album': 'Lehanga',
629 'track': 'Lehanga',
92593690 630 },
631 'skip': '404 Not Found',
0fd6661e
M
632 }, {
633 'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en',
347f13dd 634 'md5': 'f21112672ee4ce05ca390fb6522e1b6f',
0fd6661e
M
635 'info_dict': {
636 'id': '6742501081818877190',
637 'ext': 'mp4',
638 'title': 'md5:5e2a23877420bb85ce6521dbee39ba94',
639 'description': 'md5:5e2a23877420bb85ce6521dbee39ba94',
640 'duration': 27,
347f13dd 641 'height': 1024,
642 'width': 576,
0fd6661e
M
643 'uploader': 'patrox',
644 'uploader_id': '18702747',
347f13dd 645 'uploader_url': 'https://www.tiktok.com/@patrox',
646 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws',
92593690 647 'channel_id': 'MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws',
f4f9f6d0 648 'channel': 'patroX',
0fd6661e
M
649 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
650 'upload_date': '20190930',
651 'timestamp': 1569860870,
652 'view_count': int,
653 'like_count': int,
654 'repost_count': int,
655 'comment_count': int,
f4f9f6d0 656 'artists': ['Evan Todd', 'Jessica Keenan Wynn', 'Alice Lee', 'Barrett Wilbert Weed', 'Jon Eidson'],
a44ca5a4 657 'track': 'Big Fun',
92593690 658 },
0fd6661e 659 }, {
347f13dd 660 # Banned audio, was available on the app, now works with web too
96f13f01
M
661 'url': 'https://www.tiktok.com/@barudakhb_/video/6984138651336838402',
662 'info_dict': {
663 'id': '6984138651336838402',
664 'ext': 'mp4',
665 'title': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥',
666 'description': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥',
667 'uploader': 'barudakhb_',
f4f9f6d0 668 'channel': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6',
96f13f01 669 'uploader_id': '6974687867511718913',
347f13dd 670 'uploader_url': 'https://www.tiktok.com/@barudakhb_',
671 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d',
92593690 672 'channel_id': 'MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d',
96f13f01 673 'track': 'Boka Dance',
f4f9f6d0 674 'artists': ['md5:29f238c49bc0c176cb3cef1a9cea9fa6'],
96f13f01
M
675 'timestamp': 1626121503,
676 'duration': 18,
677 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
678 'upload_date': '20210712',
679 'view_count': int,
680 'like_count': int,
681 'repost_count': int,
682 'comment_count': int,
92593690 683 },
96f13f01
M
684 }, {
685 # Sponsored video, only available with feed workaround
686 'url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_/video/7042692929109986561',
687 'info_dict': {
688 'id': '7042692929109986561',
689 'ext': 'mp4',
690 'title': 'Slap and Run!',
691 'description': 'Slap and Run!',
692 'uploader': 'user440922249',
f4f9f6d0 693 'channel': 'Slap And Run',
96f13f01
M
694 'uploader_id': '7036055384943690754',
695 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_',
92593690 696 'channel_id': 'MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_',
96f13f01
M
697 'track': 'Promoted Music',
698 'timestamp': 1639754738,
699 'duration': 30,
700 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
701 'upload_date': '20211217',
702 'view_count': int,
703 'like_count': int,
704 'repost_count': int,
705 'comment_count': int,
706 },
347f13dd 707 'skip': 'This video is unavailable',
5fa3c9a8
HTL
708 }, {
709 # Video without title and description
710 'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694',
711 'info_dict': {
712 'id': '7059698374567611694',
713 'ext': 'mp4',
b801cd71 714 'title': 'TikTok video #7059698374567611694',
5fa3c9a8
HTL
715 'description': '',
716 'uploader': 'pokemonlife22',
f4f9f6d0 717 'channel': 'Pokemon',
5fa3c9a8 718 'uploader_id': '6820838815978423302',
347f13dd 719 'uploader_url': 'https://www.tiktok.com/@pokemonlife22',
720 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
92593690 721 'channel_id': 'MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
5fa3c9a8
HTL
722 'track': 'original sound',
723 'timestamp': 1643714123,
724 'duration': 6,
725 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
726 'upload_date': '20220201',
f4f9f6d0 727 'artists': ['Pokemon'],
5fa3c9a8
HTL
728 'view_count': int,
729 'like_count': int,
730 'repost_count': int,
731 'comment_count': int,
732 },
a39a7ba8 733 }, {
734 # hydration JSON is sent in a <script> element
735 'url': 'https://www.tiktok.com/@denidil6/video/7065799023130643713',
736 'info_dict': {
737 'id': '7065799023130643713',
738 'ext': 'mp4',
739 'title': '#denidil#денидил',
740 'description': '#denidil#денидил',
741 'uploader': 'denidil6',
742 'uploader_id': '7046664115636405250',
743 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAsvMSzFdQ4ikl3uR2TEJwMBbB2yZh2Zxwhx-WCo3rbDpAharE3GQCrFuJArI3C8QJ',
744 'artist': 'Holocron Music',
745 'album': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night',
746 'track': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night',
747 'timestamp': 1645134536,
748 'duration': 26,
749 'upload_date': '20220217',
750 'view_count': int,
751 'like_count': int,
752 'repost_count': int,
753 'comment_count': int,
754 },
f7c5a5e9 755 'skip': 'This video is unavailable',
8ceb07e8 756 }, {
757 # slideshow audio-only mp3 format
758 'url': 'https://www.tiktok.com/@_le_cannibale_/video/7139980461132074283',
759 'info_dict': {
760 'id': '7139980461132074283',
761 'ext': 'mp3',
762 'title': 'TikTok video #7139980461132074283',
763 'description': '',
f4f9f6d0 764 'channel': 'Antaura',
8ceb07e8 765 'uploader': '_le_cannibale_',
766 'uploader_id': '6604511138619654149',
347f13dd 767 'uploader_url': 'https://www.tiktok.com/@_le_cannibale_',
768 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP',
92593690 769 'channel_id': 'MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP',
f4f9f6d0 770 'artists': ['nathan !'],
8ceb07e8 771 'track': 'grahamscott canon',
347f13dd 772 'duration': 10,
8ceb07e8 773 'upload_date': '20220905',
774 'timestamp': 1662406249,
775 'view_count': int,
776 'like_count': int,
777 'repost_count': int,
778 'comment_count': int,
f4f9f6d0 779 'thumbnail': r're:^https://.+\.(?:webp|jpe?g)',
8ceb07e8 780 },
92593690 781 }, {
782 # only available via web
347f13dd 783 'url': 'https://www.tiktok.com/@moxypatch/video/7206382937372134662',
784 'md5': '4cdefa501ac8ac20bf04986e10916fea',
92593690 785 'info_dict': {
786 'id': '7206382937372134662',
787 'ext': 'mp4',
788 'title': 'md5:1d95c0b96560ca0e8a231af4172b2c0a',
789 'description': 'md5:1d95c0b96560ca0e8a231af4172b2c0a',
f4f9f6d0 790 'channel': 'MoxyPatch',
92593690 791 'uploader': 'moxypatch',
792 'uploader_id': '7039142049363379205',
347f13dd 793 'uploader_url': 'https://www.tiktok.com/@moxypatch',
794 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V',
92593690 795 'channel_id': 'MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V',
d9b4154c 796 'artists': ['your worst nightmare'],
92593690 797 'track': 'original sound',
798 'upload_date': '20230303',
799 'timestamp': 1677866781,
800 'duration': 10,
801 'view_count': int,
802 'like_count': int,
803 'repost_count': int,
804 'comment_count': int,
805 'thumbnail': r're:^https://.+',
806 'thumbnails': 'count:3',
807 },
808 'expected_warnings': ['Unable to find video in feed'],
c2a1bdb0 809 }, {
810 # 1080p format
add96eb9 811 'url': 'https://www.tiktok.com/@tatemcrae/video/7107337212743830830', # FIXME: Web can only get audio
c2a1bdb0 812 'md5': '982512017a8a917124d5a08c8ae79621',
813 'info_dict': {
814 'id': '7107337212743830830',
815 'ext': 'mp4',
816 'title': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! @musicontiktok',
817 'description': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! @musicontiktok',
818 'uploader': 'tatemcrae',
819 'uploader_id': '86328792343818240',
820 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd',
821 'channel_id': 'MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd',
f4f9f6d0 822 'channel': 'tate mcrae',
f4f9f6d0 823 'artists': ['tate mcrae'],
c2a1bdb0 824 'track': 'original sound',
825 'upload_date': '20220609',
826 'timestamp': 1654805899,
827 'duration': 150,
828 'view_count': int,
829 'like_count': int,
830 'repost_count': int,
831 'comment_count': int,
832 'thumbnail': r're:^https://.+\.webp',
833 },
347f13dd 834 'skip': 'Unavailable via feed API, only audio available via web',
b09bd0c1 835 }, {
836 # Slideshow, audio-only m4a format
837 'url': 'https://www.tiktok.com/@hara_yoimiya/video/7253412088251534594',
838 'md5': '2ff8fe0174db2dbf49c597a7bef4e47d',
839 'info_dict': {
840 'id': '7253412088251534594',
841 'ext': 'm4a',
842 'title': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ',
843 'description': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ',
844 'uploader': 'hara_yoimiya',
845 'uploader_id': '6582536342634676230',
347f13dd 846 'uploader_url': 'https://www.tiktok.com/@hara_yoimiya',
847 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB',
b09bd0c1 848 'channel_id': 'MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB',
347f13dd 849 'channel': 'лампочка(!)',
f4f9f6d0 850 'artists': ['Øneheart'],
b09bd0c1 851 'album': 'watching the stars',
852 'track': 'watching the stars',
347f13dd 853 'duration': 60,
b09bd0c1 854 'upload_date': '20230708',
855 'timestamp': 1688816612,
856 'view_count': int,
857 'like_count': int,
858 'comment_count': int,
859 'repost_count': int,
f4f9f6d0 860 'thumbnail': r're:^https://.+\.(?:webp|jpe?g)',
b09bd0c1 861 },
e0585e65
M
862 }, {
863 # Auto-captions available
864 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758',
add96eb9 865 'only_matching': True,
0fd6661e
M
866 }]
867
ce18a19b 868 def _real_extract(self, url):
b801cd71 869 video_id, user_id = self._match_valid_url(url).group('id', 'user_id')
41ba4a80 870
871 if self._KNOWN_APP_INFO:
872 try:
873 return self._extract_aweme_app(video_id)
874 except ExtractorError as e:
875 e.expected = True
876 self.report_warning(f'{e}; trying with webpage')
bd9ff55b 877
a39a7ba8 878 url = self._create_url(user_id, video_id)
eef1e9f4 879 video_data, status = self._extract_web_data_and_status(url, video_id)
11aa91a1 880
d9b4154c 881 if video_data and status == 0:
92593690 882 return self._parse_aweme_video_web(video_data, url, video_id)
1418a043 883 elif status == 10216:
884 raise ExtractorError('This video is private', expected=True)
d9b4154c 885 raise ExtractorError(f'Video not available, status code {status}', video_id=video_id)
f7f18f90
A
886
887
class TikTokUserIE(TikTokBaseIE):
    # Playlist extractor for the posts of one account, addressed either by its
    # profile URL or by the `tiktokuser:<secUid>` pseudo-URL
    IE_NAME = 'tiktok:user'
    _VALID_URL = r'(?:tiktokuser:|https?://(?:www\.)?tiktok\.com/@)(?P<id>[\w.-]+)/?(?:$|[#?])'
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0'
    _API_BASE_URL = 'https://www.tiktok.com/api/creator/item_list/'
    _TESTS = [{
        'url': 'https://tiktok.com/@corgibobaa?lang=en',
        'playlist_mincount': 45,
        'info_dict': {
            'id': 'MS4wLjABAAAAepiJKgwWhulvCpSuUVsp7sgVVsFJbbNaLeQ6OQ0oAJERGDUIXhb2yxxHZedsItgT',
            'title': 'corgibobaa',
        },
    }, {
        'url': 'https://www.tiktok.com/@6820838815978423302',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
            'title': '6820838815978423302',
        },
    }, {
        'url': 'https://www.tiktok.com/@meme',
        'playlist_mincount': 593,
        'info_dict': {
            'id': 'MS4wLjABAAAAiKfaDWeCsT3IHwY77zqWGtVRIy9v4ws1HbVi7auP1Vx7dJysU_hc5yRiGywojRD6',
            'title': 'meme',
        },
    }, {
        'url': 'tiktokuser:MS4wLjABAAAAM3R2BtjzVT-uAtstkl2iugMzC6AtnpkojJbjiOdDDrdsTiTR75-8lyWJCY5VvDrZ',
        'playlist_mincount': 31,
        'info_dict': {
            'id': 'MS4wLjABAAAAM3R2BtjzVT-uAtstkl2iugMzC6AtnpkojJbjiOdDDrdsTiTR75-8lyWJCY5VvDrZ',
        },
    }]

    def _build_web_query(self, sec_uid, cursor):
        """Return the query parameters for one page of the creator item list."""
        # Key order is kept as-is because it determines the order of the
        # parameters in the request URL
        return {
            'aid': '1988',
            'app_language': 'en',
            'app_name': 'tiktok_web',
            'browser_language': 'en-US',
            'browser_name': 'Mozilla',
            'browser_online': 'true',
            'browser_platform': 'Win32',
            'browser_version': '5.0 (Windows)',
            'channel': 'tiktok_web',
            'cookie_enabled': 'true',
            'count': '15',
            'cursor': cursor,
            'device_id': self._DEVICE_ID,
            'device_platform': 'web_pc',
            'focus_state': 'true',
            'from_page': 'user',
            'history_len': '2',
            'is_fullscreen': 'false',
            'is_page_visible': 'true',
            'language': 'en',
            'os': 'windows',
            'priority_region': '',
            'referer': '',
            'region': 'US',
            'screen_height': '1080',
            'screen_width': '1920',
            'secUid': sec_uid,
            'type': '1',  # pagination type: 0 == oldest-to-newest, 1 == newest-to-oldest
            'tz_name': 'UTC',
            'verifyFp': f'verify_{"".join(random.choices(string.hexdigits, k=7))}',
            'webcast_language': 'en',
        }

    def _entries(self, sec_uid, user_name):
        """Lazily yield one flat `url_result` per distinct post of the account."""
        label = user_name or sec_uid
        yielded_ids = set()

        # The cursor is a millisecond timestamp; paging starts from "now"
        cursor = int(time.time() * 1E3)
        page_num = 0
        while True:
            page_num += 1
            response = self._download_json(
                self._API_BASE_URL, label, f'Downloading page {page_num}',
                query=self._build_web_query(sec_uid, cursor),
                headers={'User-Agent': self._USER_AGENT})

            for item in traverse_obj(response, ('itemList', lambda _, v: v['id'])):
                item_id = item['id']
                # Pages may repeat items; emit each ID only once
                if item_id not in yielded_ids:
                    yielded_ids.add(item_id)
                    item_url = self._create_url(label, item_id)
                    yield self.url_result(
                        item_url, TikTokIE,
                        **self._parse_aweme_video_web(item, item_url, item_id, extract_flat=True))

            # Next cursor is the creation time of the last item on this page
            previous_cursor, cursor = cursor, traverse_obj(
                response, ('itemList', -1, 'createTime', {lambda x: int(x * 1E3)}))
            if not cursor or cursor == previous_cursor:
                # User may not have posted within this ~1 week lookback, so manually adjust cursor
                cursor = previous_cursor - 7 * 86_400_000
            # In case 'hasMorePrevious' is wrong, break if we have gone back before TikTok existed
            predates_tiktok = cursor < 1472706000000
            if predates_tiktok or not traverse_obj(response, 'hasMorePrevious'):
                break

    def _get_sec_uid(self, user_url, user_name, msg):
        """Try to read the account's `secUid` from the hydration data of `user_url`."""
        # A failed download is non-fatal and treated as an empty page
        page = self._download_webpage(
            user_url, user_name, fatal=False, headers={'User-Agent': 'Mozilla/5.0'},
            note=f'Downloading {msg} webpage', errnote=f'Unable to download {msg} webpage') or ''

        from_universal = traverse_obj(
            self._get_universal_data(page, user_name),
            ('webapp.user-detail', 'userInfo', 'user', 'secUid', {str}))
        if from_universal:
            return from_universal

        return traverse_obj(
            self._get_sigi_state(page, user_name),
            ('LiveRoom', 'liveRoomUserInfo', 'user', 'secUid', {str}),
            ('UserModule', 'users', ..., 'secUid', {str}, any))

    def _real_extract(self, url):
        """Resolve the account's `secUid` and return a playlist of its posts."""
        matched_id = self._match_id(url)
        if re.fullmatch(r'MS4wLjABAAAA[\w-]{64}', matched_id):
            # The input already is a secUid, so there is no user name to look up
            user_name, sec_uid = None, matched_id
        else:
            user_name = matched_id
            sec_uid = self._get_sec_uid(
                self._UPLOADER_URL_FORMAT % user_name, user_name, 'user')
            if not sec_uid:
                sec_uid = self._get_sec_uid(
                    self._UPLOADER_URL_FORMAT % f'{user_name}/live', user_name, 'live')

        if not sec_uid:
            # Fallback: take the video IDs listed on the embed page and read
            # the channel_id from the first of those videos that provides one
            embed_page = self._download_webpage(
                f'https://www.tiktok.com/embed/@{user_name}', user_name,
                note='Downloading user embed page', fatal=False) or ''
            embed_state = self._search_json(
                r'<script[^>]+\bid=[\'"]__FRONTITY_CONNECT_STATE__[\'"][^>]*>',
                embed_page, 'data', user_name, default={})
            data = traverse_obj(
                embed_state, ('source', 'data', f'/embed/@{user_name}', {dict}))

            for aweme_id in traverse_obj(data, ('videoList', ..., 'id', {str})):
                video_url = self._create_url(user_name, aweme_id)
                video_data, _ = self._extract_web_data_and_status(video_url, aweme_id, fatal=False)
                flat_info = self._parse_aweme_video_web(
                    video_data, video_url, aweme_id, extract_flat=True)
                sec_uid = flat_info.get('channel_id')
                if sec_uid:
                    break

        if not sec_uid:
            raise ExtractorError(
                'Unable to extract secondary user ID. If you are able to get the channel_id '
                'from a video posted by this user, try using "tiktokuser:channel_id" as the '
                'input URL (replacing `channel_id` with its actual value)', expected=True)

        return self.playlist_result(self._entries(sec_uid, user_name), sec_uid, user_name)
943d5ab1
M
1029
1030
class TikTokBaseListIE(TikTokBaseIE):  # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor
    # Shared paging logic for list-style extractors. Subclasses supply
    # `_QUERY_NAME` (the query parameter carrying the list ID) and
    # `_API_ENDPOINT` (the endpoint passed to `_call_api`).

    def _entries(self, list_id, display_id):
        """Yield an info dict for every video of the list, page by page.

        `list_id` is sent to the API under `self._QUERY_NAME`; `display_id`
        is only used for progress/error reporting.
        """
        query = {
            self._QUERY_NAME: list_id,
            'cursor': 0,
            'count': 20,
            'type': 5,
            'device_id': self._DEVICE_ID,
        }

        for page in itertools.count(1):
            for retry in self.RetryManager():
                try:
                    post_list = self._call_api(
                        self._API_ENDPOINT, display_id, query=query,
                        note=f'Downloading video list page {page}',
                        errnote='Unable to download video list')
                except ExtractorError as e:
                    # Only a JSON decode failure at position 0 is retried;
                    # every other error propagates
                    if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0:
                        retry.error = e
                        continue
                    raise
            # `.get(key, [])` only falls back when the key is missing; an
            # explicit null value would make the loop fail with TypeError
            for video in post_list.get('aweme_list') or []:
                yield {
                    **self._parse_aweme_video_app(video),
                    'extractor_key': TikTokIE.ie_key(),
                    'extractor': 'TikTok',
                    'webpage_url': f'https://tiktok.com/@_/video/{video["aweme_id"]}',
                }
            if not post_list.get('has_more'):
                break
            query['cursor'] = post_list['cursor']

    def _real_extract(self, url):
        """Return a playlist whose ID is the list ID matched from `url`."""
        list_id = self._match_id(url)
        return self.playlist_result(self._entries(list_id, list_id), list_id)
1067
1068
class TikTokSoundIE(TikTokBaseListIE):
    # Lists the videos that use a given sound; the numeric ID at the end of
    # the URL slug is sent as `music_id`
    IE_NAME = 'tiktok:sound'
    _WORKING = False
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/music/[\w\.-]+-(?P<id>[\d]+)[/?#&]?'
    _API_ENDPOINT = 'music/aweme'
    _QUERY_NAME = 'music_id'
    _TESTS = [{
        'url': 'https://www.tiktok.com/music/Build-a-Btch-6956990112127585029?lang=en',
        'playlist_mincount': 100,
        'info_dict': {
            'id': '6956990112127585029',
        },
        'expected_warnings': ['Retrying'],
    }, {
        # Actual entries are less than listed video count
        'url': 'https://www.tiktok.com/music/jiefei-soap-remix-7036843036118469381',
        'playlist_mincount': 2182,
        'info_dict': {
            'id': '7036843036118469381',
        },
        'expected_warnings': ['Retrying'],
    }]
1091
1092
class TikTokEffectIE(TikTokBaseListIE):
    # Lists the videos that use a given effect ("sticker"); the numeric ID at
    # the end of the URL slug is sent as `sticker_id`
    IE_NAME = 'tiktok:effect'
    _WORKING = False
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/sticker/[\w\.-]+-(?P<id>[\d]+)[/?#&]?'
    _API_ENDPOINT = 'sticker/aweme'
    _QUERY_NAME = 'sticker_id'
    _TESTS = [{
        'url': 'https://www.tiktok.com/sticker/MATERIAL-GWOOORL-1258156',
        'playlist_mincount': 100,
        'info_dict': {
            'id': '1258156',
        },
        'expected_warnings': ['Retrying'],
    }, {
        # Different entries between mobile and web, depending on region
        'url': 'https://www.tiktok.com/sticker/Elf-Friend-479565',
        'only_matching': True,
    }]
1111
1112
class TikTokTagIE(TikTokBaseListIE):
    # Lists the videos of a hashtag; the numeric tag ID is scraped from the
    # tag webpage and sent as `ch_id`
    IE_NAME = 'tiktok:tag'
    _WORKING = False
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/tag/(?P<id>[^/?#&]+)'
    _API_ENDPOINT = 'challenge/aweme'
    _QUERY_NAME = 'ch_id'
    _TESTS = [{
        'url': 'https://tiktok.com/tag/hello2018',
        'playlist_mincount': 39,
        'info_dict': {
            'id': '46294678',
            'title': 'hello2018',
        },
        'expected_warnings': ['Retrying'],
    }, {
        'url': 'https://tiktok.com/tag/fypシ?is_copy_url=0&is_from_webapp=v1',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve the tag name in `url` to its numeric ID and list its videos."""
        tag_name = self._match_id(url)
        request_headers = {
            'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)',
        }
        tag_page = self._download_webpage(url, tag_name, headers=request_headers)
        # The numeric ID is taken from an app deep link embedded in the page
        tag_id = self._html_search_regex(
            r'snssdk\d*://challenge/detail/(\d+)', tag_page, 'tag ID')
        entries = self._entries(tag_id, tag_name)
        return self.playlist_result(entries, tag_id, tag_name)
1139
1140
class TikTokCollectionIE(TikTokBaseIE):
    # Playlist extractor for a user's collection page
    IE_NAME = 'tiktok:collection'
    _VALID_URL = r'https?://www\.tiktok\.com/@(?P<user_id>[\w.-]+)/collection/(?P<title>[^/?#]+)-(?P<id>\d+)/?(?:[?#]|$)'
    _API_BASE_URL = 'https://www.tiktok.com/api/collection/item_list/'
    # Items requested per page; also the step by which the cursor advances
    _PAGE_COUNT = 30
    _TESTS = [{
        # playlist should have exactly 9 videos
        'url': 'https://www.tiktok.com/@imanoreotwe/collection/count-test-7371330159376370462',
        'info_dict': {
            'id': '7371330159376370462',
            'title': 'imanoreotwe-count-test',
        },
        'playlist_count': 9,
    }, {
        # tests returning multiple pages of a large collection
        'url': 'https://www.tiktok.com/@imanoreotwe/collection/%F0%9F%98%82-7111887189571160875',
        'info_dict': {
            'id': '7111887189571160875',
            'title': 'imanoreotwe-%F0%9F%98%82',
        },
        'playlist_mincount': 100,
    }]

    def _build_web_query(self, collection_id, cursor):
        """Return the query parameters for one page of the collection item list."""
        return {
            'aid': '1988',
            'collectionId': collection_id,
            'count': self._PAGE_COUNT,
            'cursor': cursor,
            'sourceType': '113',
        }

    def _entries(self, collection_id):
        """Lazily yield one flat `url_result` per video in the collection."""
        # The cursor is an item offset: 0, _PAGE_COUNT, 2 * _PAGE_COUNT, ...
        offsets = itertools.count(0, self._PAGE_COUNT)
        for page_num, offset in enumerate(offsets, 1):
            response = self._download_json(
                self._API_BASE_URL, collection_id, f'Downloading page {page_num}',
                query=self._build_web_query(collection_id, offset))

            for item in traverse_obj(response, ('itemList', lambda _, v: v['id'])):
                item_id = item['id']
                # Use the first available author identifier for the URL, or a placeholder
                owner = traverse_obj(item, ('author', ('uniqueId', 'secUid', 'id'), {str}, any)) or '_'
                item_url = self._create_url(owner, item_id)
                yield self.url_result(
                    item_url, TikTokIE,
                    **self._parse_aweme_video_web(item, item_url, item_id, extract_flat=True))

            if not traverse_obj(response, 'hasMore'):
                break

    def _real_extract(self, url):
        """Return the collection as a playlist titled `<user>-<collection title>`."""
        mobj = self._match_valid_url(url)
        collection_id = mobj.group('id')
        playlist_title = f'{mobj.group("user_id")}-{mobj.group("title")}'

        return self.playlist_result(
            self._entries(collection_id), collection_id, playlist_title)
1197
1198
class DouyinIE(TikTokBaseIE):
    # Single-video extractor for douyin.com, reusing the app-format parser of
    # the base class with Douyin-specific URL templates
    _VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P<id>[0-9]+)'
    _UPLOADER_URL_FORMAT = 'https://www.douyin.com/user/%s'
    _WEBPAGE_HOST = 'https://www.douyin.com/'
    _TESTS = [{
        'url': 'https://www.douyin.com/video/6961737553342991651',
        'md5': '9ecce7bc5b302601018ecb2871c63a75',
        'info_dict': {
            'id': '6961737553342991651',
            'ext': 'mp4',
            'title': '#杨超越 小小水手带你去远航❤️',
            'description': '#杨超越 小小水手带你去远航❤️',
            'uploader': '6897520xka',
            'uploader_id': '110403406559',
            'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
            'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
            'channel': '杨超越',
            'duration': 19,
            'timestamp': 1620905839,
            'upload_date': '20210513',
            'track': '@杨超越创作的原声',
            'artists': ['杨超越'],
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
            'thumbnail': r're:https?://.+\.jpe?g',
        },
    }, {
        'url': 'https://www.douyin.com/video/6982497745948921092',
        'md5': '15c5e660b7048af3707304e3cc02bbb5',
        'info_dict': {
            'id': '6982497745948921092',
            'ext': 'mp4',
            'title': '这个夏日和小羊@杨超越 一起遇见白色幻想',
            'description': '这个夏日和小羊@杨超越 一起遇见白色幻想',
            'uploader': '0731chaoyue',
            'uploader_id': '408654318141572',
            'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
            'channel_id': 'MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
            'channel': '杨超越工作室',
            'duration': 42,
            'timestamp': 1625739481,
            'upload_date': '20210708',
            'track': '@杨超越工作室创作的原声',
            'artists': ['杨超越工作室'],
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
            'thumbnail': r're:https?://.+\.jpe?g',
        },
    }, {
        'url': 'https://www.douyin.com/video/6953975910773099811',
        'md5': '0e6443758b8355db9a3c34864a4276be',
        'info_dict': {
            'id': '6953975910773099811',
            'ext': 'mp4',
            'title': '#一起看海 出现在你的夏日里',
            'description': '#一起看海 出现在你的夏日里',
            'uploader': '6897520xka',
            'uploader_id': '110403406559',
            'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
            'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
            'channel': '杨超越',
            'duration': 17,
            'timestamp': 1619098692,
            'upload_date': '20210422',
            'track': '@杨超越创作的原声',
            'artists': ['杨超越'],
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
            'thumbnail': r're:https?://.+\.jpe?g',
        },
    }, {
        'url': 'https://www.douyin.com/video/6950251282489675042',
        'md5': 'b4db86aec367ef810ddd38b1737d2fed',
        'info_dict': {
            'id': '6950251282489675042',
            'ext': 'mp4',
            'title': '哈哈哈，成功了哈哈哈哈哈哈',
            'uploader': '杨超越',
            'upload_date': '20210412',
            'timestamp': 1618231483,
            'uploader_id': '110403406559',
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
        },
        'skip': 'No longer available',
    }, {
        'url': 'https://www.douyin.com/video/6963263655114722595',
        'md5': '1440bcf59d8700f8e014da073a4dfea8',
        'info_dict': {
            'id': '6963263655114722595',
            'ext': 'mp4',
            'title': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
            'description': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
            'uploader': '6897520xka',
            'uploader_id': '110403406559',
            'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
            'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
            'channel': '杨超越',
            'duration': 15,
            'timestamp': 1621261163,
            'upload_date': '20210517',
            'track': '@杨超越创作的原声',
            'artists': ['杨超越'],
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
            'thumbnail': r're:https?://.+\.jpe?g',
        },
    }]

    def _real_extract(self, url):
        """Fetch the web detail JSON for the video and parse it as an app-format aweme."""
        video_id = self._match_id(url)

        response = self._download_json(
            'https://www.douyin.com/aweme/v1/web/aweme/detail/', video_id,
            'Downloading web detail JSON', 'Failed to download web detail JSON',
            query={'aweme_id': video_id}, fatal=False)
        detail = traverse_obj(response, ('aweme_detail', {dict}))
        if detail:
            return self._parse_aweme_video_app(detail)

        # TODO: Run verification challenge code to generate signature cookies
        # The error is only marked as expected when no `s_v_web_id` cookie is set
        verification_cookie = self._get_cookies(self._WEBPAGE_HOST).get('s_v_web_id')
        raise ExtractorError(
            'Fresh cookies (not necessarily logged in) are needed',
            expected=not verification_cookie)
88afe056 1332
1333
class TikTokVMIE(InfoExtractor):
    # Resolves short share links (vm.tiktok.com, vt.tiktok.com and
    # tiktok.com/t/) by following their redirect and delegating the result.
    # The `www.` prefix is optional, in line with the other TikTok URL
    # patterns in this file; previously the group had no quantifier, so
    # `https://tiktok.com/t/<id>` was not matched.
    _VALID_URL = r'https?://(?:(?:vm|vt)\.tiktok\.com|(?:www\.)?tiktok\.com/t)/(?P<id>\w+)'
    IE_NAME = 'vm.tiktok'

    _TESTS = [{
        'url': 'https://www.tiktok.com/t/ZTRC5xgJp',
        'info_dict': {
            'id': '7170520270497680683',
            'ext': 'mp4',
            'title': 'md5:c64f6152330c2efe98093ccc8597871c',
            'uploader_id': '6687535061741700102',
            'upload_date': '20221127',
            'view_count': int,
            'like_count': int,
            'comment_count': int,
            'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAObqu3WCTXxmw2xwZ3iLEHnEecEIw7ks6rxWqOqOhaPja9BI7gqUQnjw8_5FSoDXX',
            'album': 'Wave of Mutilation: Best of Pixies',
            'thumbnail': r're:https://.+\.webp.*',
            'duration': 5,
            'timestamp': 1669516858,
            'repost_count': int,
            'artist': 'Pixies',
            'track': 'Where Is My Mind?',
            'description': 'md5:c64f6152330c2efe98093ccc8597871c',
            'uploader': 'sigmachaddeus',
            'creator': 'SigmaChad',
        },
    }, {
        'url': 'https://vm.tiktok.com/ZTR45GpSF/',
        'info_dict': {
            'id': '7106798200794926362',
            'ext': 'mp4',
            'title': 'md5:edc3e7ea587847f8537468f2fe51d074',
            'uploader_id': '6997695878846268418',
            'upload_date': '20220608',
            'view_count': int,
            'like_count': int,
            'comment_count': int,
            'thumbnail': r're:https://.+\.webp.*',
            'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAdZ_NcPPgMneaGrW0hN8O_J_bwLshwNNERRF5DxOw2HKIzk0kdlLrR8RkVl1ksrMO',
            'duration': 29,
            'timestamp': 1654680400,
            'repost_count': int,
            'artist': 'Akihitoko',
            'track': 'original sound',
            'description': 'md5:edc3e7ea587847f8537468f2fe51d074',
            'uploader': 'akihitoko1',
            'creator': 'Akihitoko',
        },
    }, {
        'url': 'https://vt.tiktok.com/ZSe4FqkKd',
        'only_matching': True,
    }, {
        # Short link without the `www.` prefix
        'url': 'https://tiktok.com/t/ZTRC5xgJp',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Follow the short link's redirect and hand the target URL to the matching extractor.

        Raises UnsupportedError when the final URL is still a short link.
        """
        new_url = self._request_webpage(
            HEADRequest(url), self._match_id(url), headers={'User-Agent': 'facebookexternalhit/1.1'}).url
        if self.suitable(new_url):  # Prevent infinite loop in case redirect fails
            raise UnsupportedError(new_url)
        return self.url_result(new_url)
933ed882
JC
1394
1395
class TikTokLiveIE(TikTokBaseIE):
    # Livestream extractor; accepts `@<uploader>/live` pages and mobile share
    # links that carry the numeric room ID directly
    _VALID_URL = r'''(?x)https?://(?:
        (?:www\.)?tiktok\.com/@(?P<uploader>[\w.-]+)/live|
        m\.tiktok\.com/share/live/(?P<id>\d+)
    )'''
    IE_NAME = 'tiktok:live'

    _TESTS = [{
        'url': 'https://www.tiktok.com/@weathernewslive/live',
        'info_dict': {
            'id': '7210809319192726273',
            'ext': 'mp4',
            'title': r're:ウェザーニュースLiVE[\d\s:-]*',
            'creator': 'ウェザーニュースLiVE',
            'uploader': 'weathernewslive',
            'uploader_id': '6621496731283095554',
            'uploader_url': 'https://www.tiktok.com/@weathernewslive',
            'live_status': 'is_live',
            'concurrent_view_count': int,
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://www.tiktok.com/@pilarmagenta/live',
        'info_dict': {
            'id': '7209423610325322522',
            'ext': 'mp4',
            'title': str,
            'creator': 'Pilarmagenta',
            'uploader': 'pilarmagenta',
            'uploader_id': '6624846890674683909',
            'uploader_url': 'https://www.tiktok.com/@pilarmagenta',
            'live_status': 'is_live',
            'concurrent_view_count': int,
        },
        'skip': 'Livestream',
    }, {
        'url': 'https://m.tiktok.com/share/live/7209423610325322522/?language=en',
        'only_matching': True,
    }, {
        'url': 'https://www.tiktok.com/@iris04201/live',
        'only_matching': True,
    }]

    def _call_api(self, url, param, room_id, uploader, key=None):
        """Query a live-room endpoint and return its payload if the room is live.

        `param` is the name of the query parameter that carries `room_id`;
        `key` selects the sub-dict of the JSON response to use.

        Raises ExtractorError when the room is not live and no uploader is
        known, UserNotLive otherwise.
        """
        # A failed download is non-fatal and ends up as an empty dict here
        response = traverse_obj(self._download_json(
            url, room_id, fatal=False, query={
                'aid': '1988',
                param: room_id,
            }), (key, {dict}), default={})

        # status == 2 if live else 4
        if int_or_none(response.get('status')) == 2:
            return response
        # If room_id is obtained via mobile share URL and cannot be refreshed, do not wait for live
        elif not uploader:
            raise ExtractorError('This livestream has ended', expected=True)
        raise UserNotLive(video_id=uploader)

    def _real_extract(self, url):
        """Resolve the live room for `url` and collect its FLV/HLS formats."""
        uploader, room_id = self._match_valid_url(url).group('uploader', 'id')
        # The webpage is optional only when the URL already supplied a room ID
        webpage = self._download_webpage(
            url, uploader or room_id, headers={'User-Agent': 'Mozilla/5.0'}, fatal=not room_id)

        if webpage:
            data = self._get_sigi_state(webpage, uploader or room_id)
            # Prefer the room ID from the page state, then the app deep link,
            # then whatever the URL itself provided
            room_id = (traverse_obj(data, ('UserModule', 'users', ..., 'roomId', {str_or_none}), get_all=False)
                       or self._search_regex(r'snssdk\d*://live\?room_id=(\d+)', webpage, 'room ID', default=None)
                       or room_id)
            uploader = uploader or traverse_obj(
                data, ('LiveRoom', 'liveRoomUserInfo', 'user', 'uniqueId'),
                ('UserModule', 'users', ..., 'uniqueId'), get_all=False, expected_type=str)

        if not room_id:
            raise UserNotLive(video_id=uploader)

        formats = []
        live_info = self._call_api(
            'https://webcast.tiktok.com/webcast/room/info', 'room_id', room_id, uploader, key='data')

        # Quality labels ordered from lowest to highest preference
        get_quality = qualities(('SD1', 'ld', 'SD2', 'sd', 'HD1', 'hd', 'FULL_HD1', 'uhd', 'ORIGION', 'origin'))
        # Several fields of the response are JSON documents stored as strings
        parse_inner = lambda x: self._parse_json(x, None)

        for quality, stream in traverse_obj(live_info, (
                'stream_url', 'live_core_sdk_data', 'pull_data', 'stream_data',
                {parse_inner}, 'data', {dict}), default={}).items():

            # The traversal gives None when the stream has no usable
            # `sdk_params`; fall back to an empty dict so the `**sdk_params`
            # unpacking below cannot raise TypeError
            sdk_params = traverse_obj(stream, ('main', 'sdk_params', {parse_inner}, {
                'vcodec': ('VCodec', {str}),
                'tbr': ('vbitrate', {lambda x: int_or_none(x, 1000)}),
                'resolution': ('resolution', {lambda x: re.match(r'(?i)\d+x\d+|\d+p', x).group().lower()}),
            })) or {}

            flv_url = traverse_obj(stream, ('main', 'flv', {url_or_none}))
            if flv_url:
                formats.append({
                    'url': flv_url,
                    'ext': 'flv',
                    'format_id': f'flv-{quality}',
                    'quality': get_quality(quality),
                    **sdk_params,
                })

            hls_url = traverse_obj(stream, ('main', 'hls', {url_or_none}))
            if hls_url:
                formats.append({
                    'url': hls_url,
                    'ext': 'mp4',
                    'protocol': 'm3u8_native',
                    'format_id': f'hls-{quality}',
                    'quality': get_quality(quality),
                    **sdk_params,
                })

        def get_vcodec(*keys):
            # Reads the codec name from a JSON-encoded params field under `stream_url`
            return traverse_obj(live_info, (
                'stream_url', *keys, {parse_inner}, 'VCodec', {str}))

        for stream in ('hls', 'rtmp'):
            stream_url = traverse_obj(live_info, ('stream_url', f'{stream}_pull_url', {url_or_none}))
            if stream_url:
                formats.append({
                    'url': stream_url,
                    'ext': 'mp4' if stream == 'hls' else 'flv',
                    'protocol': 'm3u8_native' if stream == 'hls' else 'https',
                    'format_id': f'{stream}-pull',
                    'vcodec': get_vcodec(f'{stream}_pull_url_params'),
                    'quality': get_quality('ORIGION'),
                })

        for f_id, f_url in traverse_obj(live_info, ('stream_url', 'flv_pull_url', {dict}), default={}).items():
            if not url_or_none(f_url):
                continue
            formats.append({
                'url': f_url,
                'ext': 'flv',
                'format_id': f'flv-{f_id}'.lower(),
                'vcodec': get_vcodec('flv_pull_url_params', f_id),
                'quality': get_quality(f_id),
            })

        # If uploader is a guest on another's livestream, primary endpoint will not have m3u8 URLs
        if not traverse_obj(formats, lambda _, v: v['ext'] == 'mp4'):
            live_info = merge_dicts(live_info, self._call_api(
                'https://www.tiktok.com/api/live/detail/', 'roomID', room_id, uploader, key='LiveRoomInfo'))
            if url_or_none(live_info.get('liveUrl')):
                formats.append({
                    'url': live_info['liveUrl'],
                    'ext': 'mp4',
                    'protocol': 'm3u8_native',
                    'format_id': 'hls-fallback',
                    'vcodec': 'h264',
                    'quality': get_quality('origin'),
                })

        uploader = uploader or traverse_obj(live_info, ('ownerInfo', 'uniqueId'), ('owner', 'display_id'))

        return {
            'id': room_id,
            'uploader': uploader,
            'uploader_url': format_field(uploader, None, self._UPLOADER_URL_FORMAT) or None,
            'is_live': True,
            'formats': formats,
            '_format_sort_fields': ('quality', 'ext'),
            # Field names differ between the two endpoints, so both are tried
            **traverse_obj(live_info, {
                'title': 'title',
                'uploader_id': (('ownerInfo', 'owner'), 'id', {str_or_none}),
                'creator': (('ownerInfo', 'owner'), 'nickname'),
                'concurrent_view_count': (('user_count', ('liveRoomStats', 'userCount')), {int_or_none}),
            }, get_all=False),
        }