]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/tiktok.py
[ie/generic] Add `key_query` extractor-arg
[yt-dlp.git] / yt_dlp / extractor / tiktok.py
CommitLineData
3584b839 1import functools
f7f18f90 2import itertools
b801cd71 3import json
bd9ff55b 4import random
216bcb66 5import re
347f13dd 6import string
bd9ff55b 7import time
add96eb9 8import urllib.parse
cb61e20c 9import uuid
1ead840d
KS
10
11from .common import InfoExtractor
3d2623a8 12from ..networking import HEADRequest
1ead840d 13from ..utils import (
ce18a19b 14 ExtractorError,
11e1c2e3 15 UnsupportedError,
933ed882 16 UserNotLive,
8ceb07e8 17 determine_ext,
3584b839 18 filter_dict,
216bcb66 19 format_field,
1ead840d 20 int_or_none,
34921b43 21 join_nonempty,
216bcb66 22 merge_dicts,
4ccd73fe 23 mimetype2ext,
24 parse_qs,
b801cd71 25 qualities,
ba723997 26 remove_start,
e0585e65 27 srt_subtitles_timecode,
1ead840d 28 str_or_none,
bd9ff55b 29 traverse_obj,
216bcb66 30 try_call,
bd9ff55b 31 try_get,
943d5ab1 32 url_or_none,
1ead840d
KS
33)
34
35
0fd6661e 36class TikTokBaseIE(InfoExtractor):
# Template used to build uploader/channel URLs from a handle or sec_uid
_UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s'
# Host whose cookies (e.g. sid_tt) are copied over to the API and media hosts
_WEBPAGE_HOST = 'https://www.tiktok.com/'
# Resolution labels, lowest to highest, used to rank formats parsed from URL keys
QUALITIES = ('360p', '540p', '720p', '1080p')

_APP_INFO_DEFAULTS = {
    # unique "install id"
    'iid': None,
    # TikTok (KR/PH/TW/TH/VN) = trill, TikTok (rest of world) = musical_ly, Douyin = aweme
    'app_name': 'musical_ly',
    'app_version': '34.1.2',
    'manifest_app_version': '2023401020',
    # "app id": aweme = 1128, trill = 1180, musical_ly = 1233, universal = 0
    'aid': '0',
}
# Remaining app-info candidates; built lazily by _get_next_app_info()
_APP_INFO_POOL = None
# App info currently in use for API calls
_APP_INFO = None
# User-Agent derived from the app info currently in use
_APP_USER_AGENT = None
54
@functools.cached_property
def _KNOWN_APP_INFO(self):
    """Return the user-configured ``app_info`` values (cached per instance).

    When a device ID was configured, a single empty app-info string is used
    as the fallback; otherwise the fallback is an empty list.
    """
    fallback = []
    if self._KNOWN_DEVICE_ID:
        # If we have a genuine device ID, we may not need any IID
        fallback.append('')
    return self._configuration_arg('app_info', fallback, ie_key=TikTokIE)
41ba4a80 60
@functools.cached_property
def _KNOWN_DEVICE_ID(self):
    """Return the first configured ``device_id`` value, or ``None`` if unset."""
    configured_ids = self._configuration_arg('device_id', [None], ie_key=TikTokIE)
    return configured_ids[0]
64
65 @functools.cached_property
66 def _DEVICE_ID(self):
67 return self._KNOWN_DEVICE_ID or str(random.randint(7250000000000000000, 7351147085025500000))
68
69 @functools.cached_property
c4cbd3be 70 def _API_HOSTNAME(self):
71 return self._configuration_arg(
41ba4a80 72 'api_hostname', ['api16-normal-c-useast1a.tiktokv.com'], ie_key=TikTokIE)[0]
c4cbd3be 73
def _get_next_app_info(self):
    """Switch to the next app-info candidate and derive its User-Agent.

    On first use the candidate pool is built from ``_KNOWN_APP_INFO``: each
    entry is a '/'-separated string whose non-empty fields override, in the
    key order of ``_APP_INFO_DEFAULTS``, the per-key configured defaults.

    Returns ``True`` after setting ``_APP_INFO`` and ``_APP_USER_AGENT``, or
    ``False`` when the pool is exhausted.
    """
    if self._APP_INFO_POOL is None:
        # Per-key defaults; 'iid' has none and only comes from an app-info string
        base_values = {}
        for name, fallback in self._APP_INFO_DEFAULTS.items():
            if name == 'iid':
                continue
            base_values[name] = self._configuration_arg(name, [fallback], ie_key=TikTokIE)[0]

        pool = []
        for spec in self._KNOWN_APP_INFO:
            overrides = {
                name: value
                for name, value in zip(self._APP_INFO_DEFAULTS, spec.split('/'))
                if value
            }
            pool.append({**base_values, **overrides})
        self._APP_INFO_POOL = pool

    if not self._APP_INFO_POOL:
        return False

    self._APP_INFO = self._APP_INFO_POOL.pop(0)

    app_name = self._APP_INFO['app_name']
    version = self._APP_INFO['manifest_app_version']
    if app_name != 'musical_ly':  # trill, aweme
        package = f'com.ss.android.ugc.{app_name}/{version}'
    else:
        package = f'com.zhiliaoapp.musically/{version}'
    self._APP_USER_AGENT = f'{package} (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)'

    return True
101
@staticmethod
def _create_url(user_id, video_id):
    """Return the webpage URL of a video; '_' stands in for a missing user ID."""
    handle = user_id or '_'
    return f'https://www.tiktok.com/@{handle}/video/{video_id}'
105
def _get_sigi_state(self, webpage, display_id):
    """Return the JSON embedded in the SIGI state <script> element, or ``{}`` if not found."""
    script_open_tag = r'<script[^>]+\bid="(?:SIGI_STATE|sigi-persisted-data)"[^>]*>'
    return self._search_json(
        script_open_tag, webpage, 'sigi state', display_id,
        end_pattern=r'</script>', default={})
110
def _get_universal_data(self, webpage, display_id):
    """Return the ``__DEFAULT_SCOPE__`` dict of the rehydration JSON, or ``{}`` if unavailable."""
    script_open_tag = r'<script[^>]+\bid="__UNIVERSAL_DATA_FOR_REHYDRATION__"[^>]*>'
    hydration_data = self._search_json(
        script_open_tag, webpage, 'universal data', display_id,
        end_pattern=r'</script>', default={})
    default_scope = traverse_obj(hydration_data, ('__DEFAULT_SCOPE__', {dict}))
    return default_scope or {}
a39a7ba8 116
def _call_api_impl(self, ep, query, video_id, fatal=True,
                   note='Downloading API JSON', errnote='Unable to download API page'):
    """Perform one request against the ``/aweme/v1/<ep>/`` API endpoint.

    Sets a random 160-hex-digit ``odin_tt`` cookie on the API host, copies
    the ``sid_tt`` cookie from the webpage host when present, then downloads
    the JSON response using the current app User-Agent.
    """
    api_host = self._API_HOSTNAME
    odin_token = ''.join(random.choices('0123456789abcdef', k=160))
    self._set_cookie(api_host, 'odin_tt', odin_token)

    session_cookie = self._get_cookies(self._WEBPAGE_HOST).get('sid_tt')
    if session_cookie:
        self._set_cookie(api_host, 'sid_tt', session_cookie.value)

    request_headers = {
        'User-Agent': self._APP_USER_AGENT,
        'Accept': 'application/json',
    }
    return self._download_json(
        f'https://{self._API_HOSTNAME}/aweme/v1/{ep}/', video_id=video_id,
        fatal=fatal, note=note, errnote=errnote, headers=request_headers, query=query)
129
def _build_api_query(self, query):
    """Return *query* extended with the emulated app/device parameters.

    App-specific values come from ``self._APP_INFO``; timestamps and random
    identifiers are generated on every call. Fixed parameters override
    same-named keys of *query*. The result is passed through ``filter_dict``
    (presumably dropping unset entries such as a missing ``iid`` -- confirm
    against the helper).
    """
    app_info = self._APP_INFO
    app_version = app_info['app_version']
    manifest_version = app_info['manifest_app_version']

    return filter_dict({
        **query,
        'device_platform': 'android',
        'os': 'android',
        'ssmix': 'a',
        '_rticket': int(time.time() * 1000),
        'cdid': str(uuid.uuid4()),
        'channel': 'googleplay',
        'aid': app_info['aid'],
        'app_name': app_info['app_name'],
        # e.g. '34.1.2' -> '340102': every dotted component zero-padded to two digits
        'version_code': ''.join(f'{int(part):02d}' for part in app_version.split('.')),
        'version_name': app_version,
        'manifest_version_code': manifest_version,
        'update_version_code': manifest_version,
        'ab_version': app_version,
        'resolution': '1080*2400',
        'dpi': 420,
        'device_type': 'Pixel 7',
        'device_brand': 'Google',
        'language': 'en',
        'os_api': '29',
        'os_version': '13',
        'ac': 'wifi',
        'is_pad': '0',
        'current_region': 'US',
        'app_type': 'normal',
        'sys_region': 'US',
        # a random moment between 1 and 13 days ago
        'last_install_time': int(time.time()) - random.randint(86400, 1123200),
        'timezone_name': 'America/New_York',
        'residence': 'US',
        'app_language': 'en',
        'timezone_offset': '-14400',
        'host_abi': 'armeabi-v7a',
        'locale': 'en',
        'ac2': 'wifi5g',
        'uoo': '1',
        'carrier_region': 'US',
        'op_region': 'US',
        'build_number': app_version,
        'region': 'US',
        'ts': int(time.time()),
        'iid': app_info.get('iid'),
        'device_id': self._DEVICE_ID,
        'openudid': ''.join(random.choices('0123456789abcdef', k=16)),
    })
046cab39
M
176
def _call_api(self, ep, query, video_id, fatal=True,
              note='Downloading API JSON', errnote='Unable to download API page'):
    """Call an API endpoint, rotating through the app-info pool on unparsable responses.

    A response that fails JSON decoding at position 0 triggers a retry with
    the next app info. Any other ``ExtractorError`` is re-raised. When no
    app info is available (or the pool runs out), this raises if *fatal*,
    otherwise it reports a warning and returns ``None``.
    """
    if not self._APP_INFO and not self._get_next_app_info():
        message = 'No working app info is available'
        if fatal:
            raise ExtractorError(message, expected=True)
        self.report_warning(message)
        return

    max_tries = len(self._APP_INFO_POOL) + 1  # _APP_INFO_POOL + _APP_INFO
    attempt = 0
    while True:
        attempt += 1
        self.write_debug(str(self._APP_INFO))
        real_query = self._build_api_query(query)
        try:
            return self._call_api_impl(ep, real_query, video_id, fatal, note, errnote)
        except ExtractorError as err:
            undecodable = isinstance(err.cause, json.JSONDecodeError) and err.cause.pos == 0
            if not undecodable:
                raise
            message = str(err.cause or err.msg)
            if self._get_next_app_info():
                self.report_warning(f'{message}. Retrying... (attempt {attempt} of {max_tries})')
                continue
            # Pool exhausted: give up
            if fatal:
                raise
            self.report_warning(message)
            return
0fd6661e 205
def _extract_aweme_app(self, aweme_id):
    """Fetch the app feed for *aweme_id* and parse the entry whose ID matches.

    Raises ``ExtractorError`` when the feed does not contain the requested video.
    """
    feed = self._call_api(
        'feed', {'aweme_id': aweme_id}, aweme_id, note='Downloading video feed',
        errnote='Unable to download video feed')
    for candidate in feed.get('aweme_list') or []:
        if str(candidate.get('aweme_id')) == aweme_id:
            return self._parse_aweme_video_app(candidate)
    raise ExtractorError('Unable to find video in feed', video_id=aweme_id)
214
def _extract_web_data_and_status(self, url, video_id, fatal=True):
    """Download the video webpage and return ``(video_data, status)``.

    Embedded data is looked up in three places, first hit wins: the
    universal rehydration data, the sigi state, then the next.js data.
    ``status`` is the page's integer status code (0 when absent); ``({}, -1)``
    is returned when nothing could be retrieved and *fatal* is false.
    """
    response = self._download_webpage_handle(
        url, video_id, fatal=fatal, headers={'User-Agent': 'Mozilla/5.0'})
    if response is False:
        return {}, -1

    webpage, urlh = response
    if urllib.parse.urlparse(urlh.url).path == '/login':
        message = 'TikTok is requiring login for access to this content'
        if fatal:
            self.raise_login_required(message)
        self.report_warning(f'{message}. {self._login_hint()}')
        return {}, -1

    # (lazy loader, debug message, path to status code, path to video data)
    sources = (
        (lambda: self._get_universal_data(webpage, video_id),
         'Found universal data for rehydration',
         ('webapp.video-detail', 'statusCode', {int}),
         ('webapp.video-detail', 'itemInfo', 'itemStruct', {dict})),
        (lambda: self._get_sigi_state(webpage, video_id),
         'Found sigi state data',
         ('VideoPage', 'statusCode', {int}),
         ('ItemModule', video_id, {dict})),
        (lambda: self._search_nextjs_data(webpage, video_id, default={}),
         'Found next.js data',
         ('props', 'pageProps', 'statusCode', {int}),
         ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict})),
    )
    for load, debug_message, status_path, data_path in sources:
        payload = load()
        if not payload:
            continue
        self.write_debug(debug_message)
        status = traverse_obj(payload, status_path) or 0
        video_data = traverse_obj(payload, data_path)
        return video_data, status

    if fatal:
        raise ExtractorError('Unable to extract webpage video data')
    return {}, -1
249
def _get_subtitles(self, aweme_detail, aweme_id, user_name):
    """Collect subtitles for a video, trying up to three sources in order.

    1. Auto captions referenced from ``interaction_stickers``: the caption
       JSON is downloaded and converted to SRT.
    2. Caption URLs listed under ``video.cla_info.caption_infos``.
    3. ``video.subtitleInfos`` of the web data; when *user_name* is given,
       the video webpage is downloaded first to obtain that data.

    A later source is only consulted when the earlier ones yielded nothing.
    Returns a dict mapping a language code to a list of subtitle dicts.
    """
    # TODO: Extract text positioning info
    subtitles = {}
    # aweme/detail endpoint subs
    captions_info = traverse_obj(
        aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 'auto_captions', ...), expected_type=dict)
    for caption in captions_info:
        caption_url = traverse_obj(caption, ('url', 'url_list', ...), expected_type=url_or_none, get_all=False)
        if not caption_url:
            continue
        caption_json = self._download_json(
            caption_url, aweme_id, note='Downloading captions', errnote='Unable to download captions', fatal=False)
        if not caption_json:
            continue
        # Subtitles are optional: skip malformed utterances instead of failing the extraction
        cues = [
            cue for cue in traverse_obj(caption_json, ('utterances', ..., {dict}))
            if cue.get('text')
            and isinstance(cue.get('start_time'), (int, float))
            and isinstance(cue.get('end_time'), (int, float))
        ]
        if not cues:
            continue
        subtitles.setdefault(caption.get('language') or 'en', []).append({
            'ext': 'srt',
            # Cues are numbered after filtering so that SRT indices are consecutive
            'data': '\n\n'.join(
                f'{index}\n{srt_subtitles_timecode(cue["start_time"] / 1000)} --> {srt_subtitles_timecode(cue["end_time"] / 1000)}\n{cue["text"]}'
                for index, cue in enumerate(cues, start=1)),
        })
    # feed endpoint subs
    if not subtitles:
        for caption in traverse_obj(aweme_detail, ('video', 'cla_info', 'caption_infos', ...), expected_type=dict):
            if not caption.get('url'):
                continue
            subtitles.setdefault(caption.get('lang') or 'en', []).append({
                'ext': remove_start(caption.get('caption_format'), 'web'),
                'url': caption['url'],
            })
    # webpage subs
    if not subtitles:
        if user_name:  # only _parse_aweme_video_app needs to extract the webpage here
            aweme_detail, _ = self._extract_web_data_and_status(
                self._create_url(user_name, aweme_id), aweme_id, fatal=False)
        for caption in traverse_obj(aweme_detail, ('video', 'subtitleInfos', lambda _, v: v['Url'])):
            subtitles.setdefault(caption.get('LanguageCodeName') or 'en', []).append({
                'ext': remove_start(caption.get('Format'), 'web'),
                'url': caption['Url'],
            })
    return subtitles
290
def _parse_url_key(self, url_key):
    """Parse format metadata out of a URL key.

    Returns ``(format_info, res)``: a dict with format ID, video codec,
    bitrate and quality rank, plus the resolution label (e.g. '720p').
    Returns ``({}, None)`` when the key does not match the expected layout.
    """
    group_names = ('id', 'codec', 'res', 'bitrate')
    format_id, codec, res, bitrate = self._search_regex(
        r'v[^_]+_(?P<id>(?P<codec>[^_]+)_(?P<res>\d+p)_(?P<bitrate>\d+))', url_key,
        'url key', default=(None, None, None, None), group=group_names)
    if not format_id:
        return {}, None

    # 'bytevc1' is reported as h265; any other codec name is passed through unchanged
    vcodec = codec
    if codec == 'bytevc1':
        vcodec = 'h265'
    quality_rank = qualities(self.QUALITIES)
    format_info = {
        'format_id': format_id,
        'vcodec': vcodec,
        'tbr': int_or_none(bitrate, scale=1000) or None,
        'quality': quality_rank(res),
    }
    return format_info, res
303
def _parse_aweme_video_app(self, aweme_detail):
    """Build an info dict from an aweme (video) object returned by the app API.

    *aweme_detail* must contain ``aweme_id`` and ``video``; every other field
    is optional. Formats are gathered from the direct play/download addresses
    and the per-bitrate list, de-duplicated, and returned together with
    thumbnails, music metadata, author info, subtitles and statistics.
    """
    aweme_id = aweme_detail['aweme_id']
    video_info = aweme_detail['video']
    # Width/height seen per resolution label, shared between formats of the same label
    known_resolutions = {}

    def audio_meta(url):
        # Audio-only overrides for mp3 URLs and URLs containing '-music-'; empty otherwise
        ext = determine_ext(url, default_ext='m4a')
        return {
            'format_note': 'Music track',
            'ext': ext,
            'acodec': 'aac' if ext == 'm4a' else ext,
            'vcodec': 'none',
            'width': None,
            'height': None,
        } if ext == 'mp3' or '-music-' in url else {}

    def extract_addr(addr, add_meta=None):
        # Return one format dict per URL in addr['url_list'].
        # Work on a private copy: neither a shared default nor the caller's
        # dict may be modified by the setdefault() below.
        add_meta = dict(add_meta or {})
        parsed_meta, res = self._parse_url_key(addr.get('url_key') or '')
        is_bytevc2 = parsed_meta.get('vcodec') == 'bytevc2'
        if res:
            known_resolutions.setdefault(res, {}).setdefault('height', int_or_none(addr.get('height')))
            known_resolutions[res].setdefault('width', int_or_none(addr.get('width')))
            parsed_meta.update(known_resolutions.get(res, {}))
            add_meta.setdefault('height', int_or_none(res[:-1]))
        return [{
            'url': url,
            'filesize': int_or_none(addr.get('data_size')),
            'ext': 'mp4',
            'acodec': 'aac',
            'source_preference': -2 if 'aweme/v1' in url else -1,  # Downloads from API might get blocked
            **add_meta, **parsed_meta,
            # bytevc2 is bytedance's own custom h266/vvc codec, as-of-yet unplayable
            # NOTE(review): this key comes after **add_meta, so a 'preference' passed
            # in add_meta (the watermark penalty of download_addr) is overwritten here
            'preference': -100 if is_bytevc2 else -1,
            'format_note': join_nonempty(
                add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None,
                '(UNPLAYABLE)' if is_bytevc2 else None, delim=' '),
            **audio_meta(url),
        } for url in addr.get('url_list') or []]

    # Hack: Add direct video links first to prioritize them when removing duplicate formats
    formats = []
    width = int_or_none(video_info.get('width'))
    height = int_or_none(video_info.get('height'))
    # Falls back to 9:16 when the dimensions are missing or unusable
    ratio = try_call(lambda: width / height) or 0.5625
    if video_info.get('play_addr'):
        formats.extend(extract_addr(video_info['play_addr'], {
            'format_id': 'play_addr',
            'format_note': 'Direct video',
            'vcodec': 'h265' if traverse_obj(
                video_info, 'is_bytevc1', 'is_h265') else 'h264',  # TODO: Check for "direct iOS" videos, like https://www.tiktok.com/@cookierun_dev/video/7039716639834656002
            'width': width,
            'height': height,
        }))
    if video_info.get('download_addr'):
        download_addr = video_info['download_addr']
        dl_width = int_or_none(download_addr.get('width'))
        formats.extend(extract_addr(download_addr, {
            'format_id': 'download_addr',
            'format_note': 'Download video%s' % (', watermarked' if video_info.get('has_watermark') else ''),
            'vcodec': 'h264',
            'width': dl_width,
            'height': try_call(lambda: int(dl_width / ratio)),  # download_addr['height'] is wrong
            'preference': -2 if video_info.get('has_watermark') else -1,
        }))
    if video_info.get('play_addr_h264'):
        formats.extend(extract_addr(video_info['play_addr_h264'], {
            'format_id': 'play_addr_h264',
            'format_note': 'Direct video',
            'vcodec': 'h264',
        }))
    if video_info.get('play_addr_bytevc1'):
        formats.extend(extract_addr(video_info['play_addr_bytevc1'], {
            'format_id': 'play_addr_bytevc1',
            'format_note': 'Direct video',
            'vcodec': 'h265',
        }))

    # 'bit_rate' may be present but null, hence `or []` rather than a .get() default
    for bitrate in video_info.get('bit_rate') or []:
        if bitrate.get('play_addr'):
            formats.extend(extract_addr(bitrate['play_addr'], {
                'format_id': bitrate.get('gear_name'),
                'format_note': 'Playback video',
                'tbr': try_get(bitrate, lambda x: x['bit_rate'] / 1000),
                'vcodec': 'h265' if traverse_obj(
                    bitrate, 'is_bytevc1', 'is_h265') else 'h264',
                'fps': bitrate.get('FPS'),
            }))

    self._remove_duplicate_formats(formats)
    # Copy the session cookie of the webpage host to every media host
    auth_cookie = self._get_cookies(self._WEBPAGE_HOST).get('sid_tt')
    if auth_cookie:
        for f in formats:
            self._set_cookie(urllib.parse.urlparse(f['url']).hostname, 'sid_tt', auth_cookie.value)

    thumbnails = []
    for cover_id in ('cover', 'ai_dynamic_cover', 'animated_cover', 'ai_dynamic_cover_bak',
                     'origin_cover', 'dynamic_cover'):
        for cover_url in traverse_obj(video_info, (cover_id, 'url_list', ...)):
            thumbnails.append({
                'id': cover_id,
                'url': cover_url,
            })

    stats_info = aweme_detail.get('statistics') or {}
    music_info = aweme_detail.get('music') or {}
    labels = traverse_obj(aweme_detail, ('hybrid_label', ..., 'text'), expected_type=str)

    contained_music_track = traverse_obj(
        music_info, ('matched_song', 'title'), ('matched_pgc_sound', 'title'), expected_type=str)
    contained_music_author = traverse_obj(
        music_info, ('matched_song', 'author'), ('matched_pgc_sound', 'author'), 'author', expected_type=str)

    # An auto-generated "original sound - <handle>" title carries no information;
    # prefer the matched song/sound metadata in that case
    is_generic_og_trackname = music_info.get('is_original_sound') and music_info.get('title') == 'original sound - {}'.format(music_info.get('owner_handle'))
    if is_generic_og_trackname:
        music_track, music_author = contained_music_track or 'original sound', contained_music_author
    else:
        music_track, music_author = music_info.get('title'), traverse_obj(music_info, ('author', {str}))

    author_info = traverse_obj(aweme_detail, ('author', {
        'uploader': ('unique_id', {str}),
        'uploader_id': ('uid', {str_or_none}),
        'channel': ('nickname', {str}),
        'channel_id': ('sec_uid', {str}),
    }))

    return {
        'id': aweme_id,
        **traverse_obj(aweme_detail, {
            'title': ('desc', {str}),
            'description': ('desc', {str}),
            'timestamp': ('create_time', {int_or_none}),
        }),
        **traverse_obj(stats_info, {
            'view_count': 'play_count',
            'like_count': 'digg_count',
            'repost_count': 'share_count',
            'comment_count': 'comment_count',
        }, expected_type=int_or_none),
        **author_info,
        'channel_url': format_field(author_info, 'channel_id', self._UPLOADER_URL_FORMAT, default=None),
        'uploader_url': format_field(
            author_info, ['uploader', 'uploader_id'], self._UPLOADER_URL_FORMAT, default=None),
        'track': music_track,
        'album': str_or_none(music_info.get('album')) or None,
        'artists': re.split(r'(?:, | & )', music_author) if music_author else None,
        'formats': formats,
        'subtitles': self.extract_subtitles(
            aweme_detail, aweme_id, traverse_obj(author_info, 'uploader', 'uploader_id', 'channel_id')),
        'thumbnails': thumbnails,
        # Video durations are scaled by 1/1000; the music duration is the fallback
        'duration': (traverse_obj(video_info, (
            (None, 'download_addr'), 'duration', {functools.partial(int_or_none, scale=1000)}, any))
            or traverse_obj(music_info, ('duration', {int_or_none}))),
        'availability': self._availability(
            is_private='Private' in labels,
            needs_subscription='Friends only' in labels,
            is_unlisted='Followers only' in labels),
        '_format_sort_fields': ('quality', 'codec', 'size', 'br'),
    }
462
def _extract_web_formats(self, aweme_detail):
    """Return the formats found in web (camelCase) video data.

    Sources are the per-bitrate play addresses, the plain play address and
    the download address. Formats whose URL lacks 'unwatermarked' are marked
    as watermarked. If no format was found at all, the music URL is offered
    as a single audio-only format.
    """
    COMMON_FORMAT_INFO = {
        'ext': 'mp4',
        'vcodec': 'h264',
        'acodec': 'aac',
    }
    video_info = traverse_obj(aweme_detail, ('video', {dict})) or {}
    play_width = int_or_none(video_info.get('width'))
    play_height = int_or_none(video_info.get('height'))
    # Falls back to 9:16 when the dimensions are missing or unusable
    ratio = try_call(lambda: play_width / play_height) or 0.5625
    formats = []

    for bitrate_info in traverse_obj(video_info, ('bitrateInfo', lambda _, v: v['PlayAddr']['UrlList'])):
        url_key = traverse_obj(bitrate_info, ('PlayAddr', 'UrlKey', {str})) or ''
        format_info, res = self._parse_url_key(url_key)
        # bytevc2 is bytedance's own custom h266/vvc codec, as-of-yet unplayable
        is_bytevc2 = format_info.get('vcodec') == 'bytevc2'
        format_info['format_note'] = 'UNPLAYABLE' if is_bytevc2 else None
        format_info['preference'] = -100 if is_bytevc2 else -1
        format_info['filesize'] = traverse_obj(bitrate_info, ('PlayAddr', 'DataSize', {int_or_none}))

        dimension = int(res[:-1]) if res else 0
        if dimension:
            if dimension == 540:  # '540p' is actually 576p
                dimension = 576
            if ratio >= 1:  # landscape: res/dimension is height
                scaled_width = int(dimension * ratio)
                # rounded up to an even number
                format_info['width'] = scaled_width + (scaled_width % 2)
                format_info['height'] = dimension
            else:  # portrait: res/dimension is width
                scaled_height = int(dimension / ratio)
                format_info['width'] = dimension
                # rounded down to an even number
                format_info['height'] = scaled_height - (scaled_height % 2)

        formats.extend({
            **COMMON_FORMAT_INFO,
            **format_info,
            'url': self._proto_relative_url(video_url),
        } for video_url in traverse_obj(bitrate_info, ('PlayAddr', 'UrlList', ..., {url_or_none})))

    # We don't have res string for play formats, but need quality for sorting & de-duplication
    play_quality = traverse_obj(formats, (lambda _, v: v['width'] == play_width, 'quality', any))

    formats.extend({
        **COMMON_FORMAT_INFO,
        'format_id': 'play',
        'url': self._proto_relative_url(play_url),
        'width': play_width,
        'height': play_height,
        'quality': play_quality,
    } for play_url in traverse_obj(video_info, ('playAddr', ((..., 'src'), None), {url_or_none})))

    formats.extend({
        **COMMON_FORMAT_INFO,
        'format_id': 'download',
        'url': self._proto_relative_url(download_url),
    } for download_url in traverse_obj(video_info, (('downloadAddr', ('download', 'url')), {url_or_none})))

    self._remove_duplicate_formats(formats)

    for fmt in traverse_obj(formats, lambda _, v: 'unwatermarked' not in v['url']):
        fmt['format_note'] = join_nonempty(fmt.get('format_note'), 'watermarked', delim=', ')
        fmt['preference'] = fmt.get('preference') or -2

    # Is it a slideshow with only audio for download?
    if not formats and traverse_obj(aweme_detail, ('music', 'playUrl', {url_or_none})):
        audio_url = aweme_detail['music']['playUrl']
        # The extension is derived from the last 'mime_type' query value, with '_' read as '/'
        ext = traverse_obj(parse_qs(audio_url), (
            'mime_type', -1, {lambda x: x.replace('_', '/')}, {mimetype2ext})) or 'm4a'
        formats.append({
            'format_id': 'audio',
            'url': self._proto_relative_url(audio_url),
            'ext': ext,
            'acodec': 'aac' if ext == 'm4a' else ext,
            'vcodec': 'none',
        })

    return formats
551
def _parse_aweme_video_web(self, aweme_detail, webpage_url, video_id, extract_flat=False):
    """Build an info dict from web (camelCase) video data.

    With *extract_flat* set, formats and subtitles are not extracted and
    both fields are ``None``. *webpage_url* is sent as the Referer header.
    """
    author_info = traverse_obj(aweme_detail, (('authorInfo', 'author', None), {
        'channel': ('nickname', {str}),
        'channel_id': (('authorSecId', 'secUid'), {str}),
        'uploader': (('uniqueId', 'author'), {str}),
        'uploader_id': (('authorId', 'uid', 'id'), {str_or_none}),
    }), get_all=False)

    if extract_flat:
        formats = subtitles = None
    else:
        formats = self._extract_web_formats(aweme_detail)
        subtitles = self.extract_subtitles(aweme_detail, video_id, None)

    music_meta = traverse_obj(aweme_detail, ('music', {
        'track': ('title', {str}),
        'album': ('album', {str}, {lambda x: x or None}),
        'artists': ('authorName', {str}, {lambda x: re.split(r'(?:, | & )', x) if x else None}),
        'duration': ('duration', {int_or_none}),
    }))
    video_meta = traverse_obj(aweme_detail, {
        'title': ('desc', {str}),
        'description': ('desc', {str}),
        # audio-only slideshows have a video duration of 0 and an actual audio duration
        'duration': ('video', 'duration', {int_or_none}, {lambda x: x or None}),
        'timestamp': ('createTime', {int_or_none}),
    })
    counters = traverse_obj(aweme_detail, ('stats', {
        'view_count': 'playCount',
        'like_count': 'diggCount',
        'repost_count': 'shareCount',
        'comment_count': 'commentCount',
    }), expected_type=int_or_none)
    thumbnails = traverse_obj(aweme_detail, (
        (None, 'video'), ('thumbnail', 'cover', 'dynamicCover', 'originCover'), {
            'url': ({url_or_none}, {self._proto_relative_url}),
        },
    ))

    # Later entries take precedence, so the order of the merged dicts matters
    return {
        'id': video_id,
        'formats': formats,
        'subtitles': subtitles,
        'http_headers': {'Referer': webpage_url},
        **author_info,
        'channel_url': format_field(author_info, 'channel_id', self._UPLOADER_URL_FORMAT, default=None),
        'uploader_url': format_field(
            author_info, ['uploader', 'uploader_id'], self._UPLOADER_URL_FORMAT, default=None),
        **music_meta,
        **video_meta,
        **counters,
        'thumbnails': thumbnails,
    }
594
0fd6661e
M
595
596class TikTokIE(TikTokBaseIE):
c4cbd3be 597 _VALID_URL = r'https?://www\.tiktok\.com/(?:embed|@(?P<user_id>[\w\.-]+)?/video)/(?P<id>\d+)'
bfd973ec 598 _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
0fd6661e
M
599
600 _TESTS = [{
601 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610',
0481e266 602 'md5': '736bb7a466c6f0a6afeb597da1e6f5b7',
0fd6661e
M
603 'info_dict': {
604 'id': '6748451240264420610',
605 'ext': 'mp4',
606 'title': '#jassmanak #lehanga #leenabhushan',
607 'description': '#jassmanak #lehanga #leenabhushan',
608 'duration': 13,
0481e266 609 'height': 1024,
610 'width': 576,
0fd6661e
M
611 'uploader': 'leenabhushan',
612 'uploader_id': '6691488002098119685',
0481e266 613 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA_Eb4t1vodM1IuTy_cvp9CY22RAb59xqrO0Xtz9CYQJvgXaDvZxYnZYRzDWhhgJmy',
0fd6661e
M
614 'creator': 'facestoriesbyleenabh',
615 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
616 'upload_date': '20191016',
617 'timestamp': 1571246252,
618 'view_count': int,
619 'like_count': int,
620 'repost_count': int,
621 'comment_count': int,
a44ca5a4 622 'artist': 'Ysrbeats',
623 'album': 'Lehanga',
624 'track': 'Lehanga',
92593690 625 },
626 'skip': '404 Not Found',
0fd6661e
M
627 }, {
628 'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en',
347f13dd 629 'md5': 'f21112672ee4ce05ca390fb6522e1b6f',
0fd6661e
M
630 'info_dict': {
631 'id': '6742501081818877190',
632 'ext': 'mp4',
633 'title': 'md5:5e2a23877420bb85ce6521dbee39ba94',
634 'description': 'md5:5e2a23877420bb85ce6521dbee39ba94',
635 'duration': 27,
347f13dd 636 'height': 1024,
637 'width': 576,
0fd6661e
M
638 'uploader': 'patrox',
639 'uploader_id': '18702747',
347f13dd 640 'uploader_url': 'https://www.tiktok.com/@patrox',
641 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws',
92593690 642 'channel_id': 'MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws',
f4f9f6d0 643 'channel': 'patroX',
0fd6661e
M
644 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
645 'upload_date': '20190930',
646 'timestamp': 1569860870,
647 'view_count': int,
648 'like_count': int,
649 'repost_count': int,
650 'comment_count': int,
f4f9f6d0 651 'artists': ['Evan Todd', 'Jessica Keenan Wynn', 'Alice Lee', 'Barrett Wilbert Weed', 'Jon Eidson'],
a44ca5a4 652 'track': 'Big Fun',
92593690 653 },
0fd6661e 654 }, {
347f13dd 655 # Banned audio, was available on the app, now works with web too
96f13f01
M
656 'url': 'https://www.tiktok.com/@barudakhb_/video/6984138651336838402',
657 'info_dict': {
658 'id': '6984138651336838402',
659 'ext': 'mp4',
660 'title': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥',
661 'description': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥',
662 'uploader': 'barudakhb_',
f4f9f6d0 663 'channel': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6',
96f13f01 664 'uploader_id': '6974687867511718913',
347f13dd 665 'uploader_url': 'https://www.tiktok.com/@barudakhb_',
666 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d',
92593690 667 'channel_id': 'MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d',
96f13f01 668 'track': 'Boka Dance',
f4f9f6d0 669 'artists': ['md5:29f238c49bc0c176cb3cef1a9cea9fa6'],
96f13f01
M
670 'timestamp': 1626121503,
671 'duration': 18,
672 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
673 'upload_date': '20210712',
674 'view_count': int,
675 'like_count': int,
676 'repost_count': int,
677 'comment_count': int,
92593690 678 },
96f13f01
M
679 }, {
680 # Sponsored video, only available with feed workaround
681 'url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_/video/7042692929109986561',
682 'info_dict': {
683 'id': '7042692929109986561',
684 'ext': 'mp4',
685 'title': 'Slap and Run!',
686 'description': 'Slap and Run!',
687 'uploader': 'user440922249',
f4f9f6d0 688 'channel': 'Slap And Run',
96f13f01
M
689 'uploader_id': '7036055384943690754',
690 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_',
92593690 691 'channel_id': 'MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_',
96f13f01
M
692 'track': 'Promoted Music',
693 'timestamp': 1639754738,
694 'duration': 30,
695 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
696 'upload_date': '20211217',
697 'view_count': int,
698 'like_count': int,
699 'repost_count': int,
700 'comment_count': int,
701 },
347f13dd 702 'skip': 'This video is unavailable',
5fa3c9a8
HTL
703 }, {
704 # Video without title and description
705 'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694',
706 'info_dict': {
707 'id': '7059698374567611694',
708 'ext': 'mp4',
b801cd71 709 'title': 'TikTok video #7059698374567611694',
5fa3c9a8
HTL
710 'description': '',
711 'uploader': 'pokemonlife22',
f4f9f6d0 712 'channel': 'Pokemon',
5fa3c9a8 713 'uploader_id': '6820838815978423302',
347f13dd 714 'uploader_url': 'https://www.tiktok.com/@pokemonlife22',
715 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
92593690 716 'channel_id': 'MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
5fa3c9a8
HTL
717 'track': 'original sound',
718 'timestamp': 1643714123,
719 'duration': 6,
720 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
721 'upload_date': '20220201',
f4f9f6d0 722 'artists': ['Pokemon'],
5fa3c9a8
HTL
723 'view_count': int,
724 'like_count': int,
725 'repost_count': int,
726 'comment_count': int,
727 },
a39a7ba8 728 }, {
729 # hydration JSON is sent in a <script> element
730 'url': 'https://www.tiktok.com/@denidil6/video/7065799023130643713',
731 'info_dict': {
732 'id': '7065799023130643713',
733 'ext': 'mp4',
734 'title': '#denidil#денидил',
735 'description': '#denidil#денидил',
736 'uploader': 'denidil6',
737 'uploader_id': '7046664115636405250',
738 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAsvMSzFdQ4ikl3uR2TEJwMBbB2yZh2Zxwhx-WCo3rbDpAharE3GQCrFuJArI3C8QJ',
739 'artist': 'Holocron Music',
740 'album': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night',
741 'track': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night',
742 'timestamp': 1645134536,
743 'duration': 26,
744 'upload_date': '20220217',
745 'view_count': int,
746 'like_count': int,
747 'repost_count': int,
748 'comment_count': int,
749 },
f7c5a5e9 750 'skip': 'This video is unavailable',
8ceb07e8 751 }, {
752 # slideshow audio-only mp3 format
753 'url': 'https://www.tiktok.com/@_le_cannibale_/video/7139980461132074283',
754 'info_dict': {
755 'id': '7139980461132074283',
756 'ext': 'mp3',
757 'title': 'TikTok video #7139980461132074283',
758 'description': '',
f4f9f6d0 759 'channel': 'Antaura',
8ceb07e8 760 'uploader': '_le_cannibale_',
761 'uploader_id': '6604511138619654149',
347f13dd 762 'uploader_url': 'https://www.tiktok.com/@_le_cannibale_',
763 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP',
92593690 764 'channel_id': 'MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP',
f4f9f6d0 765 'artists': ['nathan !'],
8ceb07e8 766 'track': 'grahamscott canon',
347f13dd 767 'duration': 10,
8ceb07e8 768 'upload_date': '20220905',
769 'timestamp': 1662406249,
770 'view_count': int,
771 'like_count': int,
772 'repost_count': int,
773 'comment_count': int,
f4f9f6d0 774 'thumbnail': r're:^https://.+\.(?:webp|jpe?g)',
8ceb07e8 775 },
92593690 776 }, {
777 # only available via web
347f13dd 778 'url': 'https://www.tiktok.com/@moxypatch/video/7206382937372134662',
779 'md5': '4cdefa501ac8ac20bf04986e10916fea',
92593690 780 'info_dict': {
781 'id': '7206382937372134662',
782 'ext': 'mp4',
783 'title': 'md5:1d95c0b96560ca0e8a231af4172b2c0a',
784 'description': 'md5:1d95c0b96560ca0e8a231af4172b2c0a',
f4f9f6d0 785 'channel': 'MoxyPatch',
92593690 786 'uploader': 'moxypatch',
787 'uploader_id': '7039142049363379205',
347f13dd 788 'uploader_url': 'https://www.tiktok.com/@moxypatch',
789 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V',
92593690 790 'channel_id': 'MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V',
d9b4154c 791 'artists': ['your worst nightmare'],
92593690 792 'track': 'original sound',
793 'upload_date': '20230303',
794 'timestamp': 1677866781,
795 'duration': 10,
796 'view_count': int,
797 'like_count': int,
798 'repost_count': int,
799 'comment_count': int,
800 'thumbnail': r're:^https://.+',
801 'thumbnails': 'count:3',
802 },
803 'expected_warnings': ['Unable to find video in feed'],
c2a1bdb0 804 }, {
805 # 1080p format
add96eb9 806 'url': 'https://www.tiktok.com/@tatemcrae/video/7107337212743830830', # FIXME: Web can only get audio
c2a1bdb0 807 'md5': '982512017a8a917124d5a08c8ae79621',
808 'info_dict': {
809 'id': '7107337212743830830',
810 'ext': 'mp4',
811 'title': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! @musicontiktok',
812 'description': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! @musicontiktok',
813 'uploader': 'tatemcrae',
814 'uploader_id': '86328792343818240',
815 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd',
816 'channel_id': 'MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd',
f4f9f6d0 817 'channel': 'tate mcrae',
f4f9f6d0 818 'artists': ['tate mcrae'],
c2a1bdb0 819 'track': 'original sound',
820 'upload_date': '20220609',
821 'timestamp': 1654805899,
822 'duration': 150,
823 'view_count': int,
824 'like_count': int,
825 'repost_count': int,
826 'comment_count': int,
827 'thumbnail': r're:^https://.+\.webp',
828 },
347f13dd 829 'skip': 'Unavailable via feed API, only audio available via web',
b09bd0c1 830 }, {
831 # Slideshow, audio-only m4a format
832 'url': 'https://www.tiktok.com/@hara_yoimiya/video/7253412088251534594',
833 'md5': '2ff8fe0174db2dbf49c597a7bef4e47d',
834 'info_dict': {
835 'id': '7253412088251534594',
836 'ext': 'm4a',
837 'title': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ',
838 'description': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ',
839 'uploader': 'hara_yoimiya',
840 'uploader_id': '6582536342634676230',
347f13dd 841 'uploader_url': 'https://www.tiktok.com/@hara_yoimiya',
842 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB',
b09bd0c1 843 'channel_id': 'MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB',
347f13dd 844 'channel': 'лампочка(!)',
f4f9f6d0 845 'artists': ['Øneheart'],
b09bd0c1 846 'album': 'watching the stars',
847 'track': 'watching the stars',
347f13dd 848 'duration': 60,
b09bd0c1 849 'upload_date': '20230708',
850 'timestamp': 1688816612,
851 'view_count': int,
852 'like_count': int,
853 'comment_count': int,
854 'repost_count': int,
f4f9f6d0 855 'thumbnail': r're:^https://.+\.(?:webp|jpe?g)',
b09bd0c1 856 },
e0585e65
M
857 }, {
858 # Auto-captions available
859 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758',
add96eb9 860 'only_matching': True,
0fd6661e
M
861 }]
862
def _real_extract(self, url):
    """Extract a single TikTok video, preferring the app API and falling back to the webpage.

    The app API is only attempted when `_KNOWN_APP_INFO` is non-empty; any
    ExtractorError it raises is downgraded to a warning before the web
    extraction is tried.

    Raises ExtractorError when the web data is missing or its status code is non-zero.
    """
    mobj = self._match_valid_url(url)
    video_id, user_id = mobj.group('id'), mobj.group('user_id')

    if self._KNOWN_APP_INFO:
        try:
            return self._extract_aweme_app(video_id)
        except ExtractorError as err:
            # Mark as expected so the warning does not ask the user to report a bug
            err.expected = True
            self.report_warning(f'{err}; trying with webpage')

    # Rebuild the canonical webpage URL from the matched parts
    url = self._create_url(user_id, video_id)
    video_data, status = self._extract_web_data_and_status(url, video_id)

    # Status 10216 is reported separately as a private video
    if status == 10216:
        raise ExtractorError('This video is private', expected=True)
    if not video_data or status != 0:
        raise ExtractorError(f'Video not available, status code {status}', video_id=video_id)
    return self._parse_aweme_video_web(video_data, url, video_id)
f7f18f90
A
881
882
0fd6661e 883class TikTokUserIE(TikTokBaseIE):
f7f18f90 884 IE_NAME = 'tiktok:user'
347f13dd 885 _VALID_URL = r'(?:tiktokuser:|https?://(?:www\.)?tiktok\.com/@)(?P<id>[\w.-]+)/?(?:$|[#?])'
f7f18f90 886 _TESTS = [{
526d74ec 887 'url': 'https://tiktok.com/@corgibobaa?lang=en',
f7f18f90
A
888 'playlist_mincount': 45,
889 'info_dict': {
347f13dd 890 'id': 'MS4wLjABAAAAepiJKgwWhulvCpSuUVsp7sgVVsFJbbNaLeQ6OQ0oAJERGDUIXhb2yxxHZedsItgT',
0481e266 891 'title': 'corgibobaa',
f7f18f90 892 },
5fa3c9a8
HTL
893 }, {
894 'url': 'https://www.tiktok.com/@6820838815978423302',
895 'playlist_mincount': 5,
896 'info_dict': {
347f13dd 897 'id': 'MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
5fa3c9a8 898 'title': '6820838815978423302',
5fa3c9a8 899 },
f7f18f90
A
900 }, {
901 'url': 'https://www.tiktok.com/@meme',
902 'playlist_mincount': 593,
903 'info_dict': {
347f13dd 904 'id': 'MS4wLjABAAAAiKfaDWeCsT3IHwY77zqWGtVRIy9v4ws1HbVi7auP1Vx7dJysU_hc5yRiGywojRD6',
0481e266 905 'title': 'meme',
f7f18f90 906 },
347f13dd 907 }, {
908 'url': 'tiktokuser:MS4wLjABAAAAM3R2BtjzVT-uAtstkl2iugMzC6AtnpkojJbjiOdDDrdsTiTR75-8lyWJCY5VvDrZ',
909 'playlist_mincount': 31,
910 'info_dict': {
911 'id': 'MS4wLjABAAAAM3R2BtjzVT-uAtstkl2iugMzC6AtnpkojJbjiOdDDrdsTiTR75-8lyWJCY5VvDrZ',
912 },
f7f18f90 913 }]
347f13dd 914 _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0'
915 _API_BASE_URL = 'https://www.tiktok.com/api/creator/item_list/'
f7f18f90 916
def _build_web_query(self, sec_uid, cursor):
    """Assemble the query parameters for one request to the creator item-list web API.

    `sec_uid` is sent as `secUid` and `cursor` as `cursor`; `device_id` is read
    from `self._DEVICE_ID`. Everything else is a fixed browser/device descriptor,
    except `verifyFp`, which gets a fresh random 7-character suffix on every call.

    Returns a dict whose key order matches the order the parameters are listed in here.
    """
    # Static description of the pretended browser
    browser_fields = {
        'aid': '1988',
        'app_language': 'en',
        'app_name': 'tiktok_web',
        'browser_language': 'en-US',
        'browser_name': 'Mozilla',
        'browser_online': 'true',
        'browser_platform': 'Win32',
        'browser_version': '5.0 (Windows)',
        'channel': 'tiktok_web',
        'cookie_enabled': 'true',
    }
    # Static description of the pretended page/session state
    page_fields = {
        'device_platform': 'web_pc',
        'focus_state': 'true',
        'from_page': 'user',
        'history_len': '2',
        'is_fullscreen': 'false',
        'is_page_visible': 'true',
        'language': 'en',
        'os': 'windows',
        'priority_region': '',
        'referer': '',
        'region': 'US',
        'screen_height': '1080',
        'screen_width': '1920',
    }
    return {
        **browser_fields,
        'count': '15',
        'cursor': cursor,
        'device_id': self._DEVICE_ID,
        **page_fields,
        'secUid': sec_uid,
        'type': '1',  # pagination type: 0 == oldest-to-newest, 1 == newest-to-oldest
        'tz_name': 'UTC',
        'verifyFp': 'verify_' + ''.join(random.choices(string.hexdigits, k=7)),
        'webcast_language': 'en',
    }
951
def _entries(self, sec_uid, user_name):
    """Yield flat `url_result` entries for a user's videos, paging through the web item-list API.

    `user_name` (or `sec_uid` when no name is known) is used as the display ID
    and to build each video's webpage URL. Videos already yielded are skipped.
    Paging stops once the response has no truthy `hasMorePrevious` or the
    cursor falls below a fixed lower bound.
    """
    display_id = user_name or sec_uid
    yielded_ids = set()

    # The cursor is a timestamp in milliseconds; begin at the current time
    cursor = int(time.time() * 1E3)
    page = 0
    while True:
        page += 1
        response = self._download_json(
            self._API_BASE_URL, display_id, f'Downloading page {page}',
            query=self._build_web_query(sec_uid, cursor), headers={'User-Agent': self._USER_AGENT})

        for video in traverse_obj(response, ('itemList', lambda _, v: v['id'])):
            video_id = video['id']
            if video_id not in yielded_ids:
                yielded_ids.add(video_id)
                webpage_url = self._create_url(display_id, video_id)
                yield self.url_result(
                    webpage_url, TikTokIE,
                    **self._parse_aweme_video_web(video, webpage_url, video_id, extract_flat=True))

        # The next cursor is the creation time (seconds -> ms) of the last item on this page
        last_item_ms = traverse_obj(
            response, ('itemList', -1, 'createTime', {lambda x: int(x * 1E3)}))
        if last_item_ms and last_item_ms != cursor:
            cursor = last_item_ms
        else:
            # User may not have posted within this ~1 week lookback, so manually adjust cursor
            cursor -= 7 * 86_400_000
        # In case 'hasMorePrevious' is wrong, break if we have gone back before TikTok existed
        if cursor < 1472706000000 or not traverse_obj(response, 'hasMorePrevious'):
            break
b3187433 981
def _get_sec_uid(self, user_url, user_name, msg):
    """Download `user_url` and try to read the user's `secUid` from the page's embedded data.

    `msg` only labels the download in the progress and error notes. A failed
    download is non-fatal and is treated as an empty page.

    Returns the `secUid` string, or None when neither data source provides one.
    """
    webpage = self._download_webpage(
        user_url, user_name, fatal=False, headers={'User-Agent': 'Mozilla/5.0'},
        note=f'Downloading {msg} webpage', errnote=f'Unable to download {msg} webpage') or ''

    # First choice: the universal data blob
    sec_uid = traverse_obj(
        self._get_universal_data(webpage, user_name),
        ('webapp.user-detail', 'userInfo', 'user', 'secUid', {str}))
    if sec_uid:
        return sec_uid

    # Otherwise look in the SIGI state, under either the live-room or the user module
    return traverse_obj(
        self._get_sigi_state(webpage, user_name),
        ('LiveRoom', 'liveRoomUserInfo', 'user', 'secUid', {str}),
        ('UserModule', 'users', ..., 'secUid', {str}, any))
b3187433 991
def _real_extract(self, url):
    """Resolve the user's secondary ID (`sec_uid`) and return a playlist of their videos.

    The matched ID is either a user name or, when it has the
    `MS4wLjABAAAA` + 64 characters shape, the `sec_uid` itself. For a user
    name the `sec_uid` is looked up on the user page, then the live page,
    and finally via the videos listed on the user's embed page.

    Raises ExtractorError (expected) when no `sec_uid` can be determined.
    """
    matched_id = self._match_id(url)
    if re.fullmatch(r'MS4wLjABAAAA[\w-]{64}', matched_id):
        # The input already is a sec_uid; no user name is known in this case
        user_name, sec_uid = None, matched_id
    else:
        user_name = matched_id
        sec_uid = self._get_sec_uid(self._UPLOADER_URL_FORMAT % user_name, user_name, 'user')
        if not sec_uid:
            sec_uid = self._get_sec_uid(
                self._UPLOADER_URL_FORMAT % f'{user_name}/live', user_name, 'live')

    if not sec_uid:
        # Last resort: take the channel_id of any video listed on the embed page
        embed_page = self._download_webpage(
            f'https://www.tiktok.com/embed/@{user_name}', user_name,
            note='Downloading user embed page', fatal=False) or ''
        embed_state = self._search_json(
            r'<script[^>]+\bid=[\'"]__FRONTITY_CONNECT_STATE__[\'"][^>]*>',
            embed_page, 'data', user_name, default={})
        data = traverse_obj(embed_state, ('source', 'data', f'/embed/@{user_name}', {dict}))

        for aweme_id in traverse_obj(data, ('videoList', ..., 'id', {str})):
            webpage_url = self._create_url(user_name, aweme_id)
            video_data, _ = self._extract_web_data_and_status(webpage_url, aweme_id, fatal=False)
            candidate = self._parse_aweme_video_web(
                video_data, webpage_url, aweme_id, extract_flat=True).get('channel_id')
            if candidate:
                sec_uid = candidate
                break

    if not sec_uid:
        raise ExtractorError(
            'Unable to extract secondary user ID. If you are able to get the channel_id '
            'from a video posted by this user, try using "tiktokuser:channel_id" as the '
            'input URL (replacing `channel_id` with its actual value)', expected=True)

    return self.playlist_result(self._entries(sec_uid, user_name), sec_uid, user_name)
943d5ab1
M
1024
1025
class TikTokBaseListIE(TikTokBaseIE):  # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor
    """Shared pagination logic for list-style extractors backed by the app API.

    Subclasses provide `_QUERY_NAME` (the query key that carries the list ID)
    and `_API_ENDPOINT` (the endpoint passed to `_call_api`).
    """

    def _entries(self, list_id, display_id):
        """Yield one info dict per video in the list, following the API's cursor.

        A page is retried (via RetryManager) when the API call fails with a
        JSON decode error at position 0; any other ExtractorError propagates.
        Paging ends when the response has no truthy `has_more`, or when it
        does not supply a `cursor` to continue from.
        """
        query = {
            self._QUERY_NAME: list_id,
            'cursor': 0,
            'count': 20,
            'type': 5,
            'device_id': self._DEVICE_ID,
        }

        for page in itertools.count(1):
            for retry in self.RetryManager():
                try:
                    post_list = self._call_api(
                        self._API_ENDPOINT, query, display_id, note=f'Downloading video list page {page}',
                        errnote='Unable to download video list')
                except ExtractorError as e:
                    if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0:
                        retry.error = e
                        continue
                    raise

            # `aweme_list` may be present but null; treat that the same as an empty page
            for video in post_list.get('aweme_list') or []:
                yield {
                    **self._parse_aweme_video_app(video),
                    'extractor_key': TikTokIE.ie_key(),
                    'extractor': 'TikTok',
                    'webpage_url': f'https://tiktok.com/@_/video/{video["aweme_id"]}',
                }
            if not post_list.get('has_more'):
                break
            next_cursor = post_list.get('cursor')
            if next_cursor is None:
                # Without a cursor we would only re-request the same page; stop cleanly instead
                break
            query['cursor'] = next_cursor

    def _real_extract(self, url):
        """Return a playlist whose ID is the list ID matched from `url`."""
        list_id = self._match_id(url)
        return self.playlist_result(self._entries(list_id, list_id), list_id)
1061
1062
class TikTokSoundIE(TikTokBaseListIE):
    """List extractor for `tiktok.com/music/<name>-<id>` pages (currently flagged as not working)."""
    IE_NAME = 'tiktok:sound'
    _WORKING = False
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/music/[\w\.-]+-(?P<id>[\d]+)[/?#&]?'

    # Parameters consumed by TikTokBaseListIE._entries
    _API_ENDPOINT = 'music/aweme'
    _QUERY_NAME = 'music_id'

    _TESTS = [{
        'url': 'https://www.tiktok.com/music/Build-a-Btch-6956990112127585029?lang=en',
        'info_dict': {'id': '6956990112127585029'},
        'playlist_mincount': 100,
        'expected_warnings': ['Retrying'],
    }, {
        # Actual entries are less than listed video count
        'url': 'https://www.tiktok.com/music/jiefei-soap-remix-7036843036118469381',
        'info_dict': {'id': '7036843036118469381'},
        'playlist_mincount': 2182,
        'expected_warnings': ['Retrying'],
    }]
1085
1086
class TikTokEffectIE(TikTokBaseListIE):
    """List extractor for `tiktok.com/sticker/<name>-<id>` pages (currently flagged as not working)."""
    IE_NAME = 'tiktok:effect'
    _WORKING = False
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/sticker/[\w\.-]+-(?P<id>[\d]+)[/?#&]?'

    # Parameters consumed by TikTokBaseListIE._entries
    _API_ENDPOINT = 'sticker/aweme'
    _QUERY_NAME = 'sticker_id'

    _TESTS = [{
        'url': 'https://www.tiktok.com/sticker/MATERIAL-GWOOORL-1258156',
        'info_dict': {'id': '1258156'},
        'playlist_mincount': 100,
        'expected_warnings': ['Retrying'],
    }, {
        # Different entries between mobile and web, depending on region
        'url': 'https://www.tiktok.com/sticker/Elf-Friend-479565',
        'only_matching': True,
    }]
1105
1106
class TikTokTagIE(TikTokBaseListIE):
    """List extractor for `tiktok.com/tag/<name>` pages (currently flagged as not working)."""
    IE_NAME = 'tiktok:tag'
    _WORKING = False
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/tag/(?P<id>[^/?#&]+)'

    # Parameters consumed by TikTokBaseListIE._entries
    _API_ENDPOINT = 'challenge/aweme'
    _QUERY_NAME = 'ch_id'

    _TESTS = [{
        'url': 'https://tiktok.com/tag/hello2018',
        'info_dict': {
            'id': '46294678',
            'title': 'hello2018',
        },
        'playlist_mincount': 39,
        'expected_warnings': ['Retrying'],
    }, {
        'url': 'https://tiktok.com/tag/fypシ?is_copy_url=0&is_from_webapp=v1',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Look up the numeric tag ID on the tag webpage, then list its videos.

        The tag name from the URL is used as display ID and playlist title;
        the numeric ID found in the page is the playlist ID.
        """
        tag_name = self._match_id(url)
        request_headers = {
            'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)',
        }
        webpage = self._download_webpage(url, tag_name, headers=request_headers)
        # The numeric ID appears in an app deep link embedded in the page
        tag_id = self._html_search_regex(r'snssdk\d*://challenge/detail/(\d+)', webpage, 'tag ID')
        return self.playlist_result(self._entries(tag_id, tag_name), tag_id, tag_name)
1133
1134
class TikTokCollectionIE(TikTokBaseIE):
    """Playlist extractor for `tiktok.com/@<user>/collection/<title>-<id>` pages."""
    IE_NAME = 'tiktok:collection'
    _VALID_URL = r'https?://www\.tiktok\.com/@(?P<user_id>[\w.-]+)/collection/(?P<title>[^/?#]+)-(?P<id>\d+)/?(?:[?#]|$)'
    _API_BASE_URL = 'https://www.tiktok.com/api/collection/item_list/'
    # Number of items requested per page; also the step by which the cursor advances
    _PAGE_COUNT = 30
    _TESTS = [{
        # playlist should have exactly 9 videos
        'url': 'https://www.tiktok.com/@imanoreotwe/collection/count-test-7371330159376370462',
        'playlist_count': 9,
        'info_dict': {
            'id': '7371330159376370462',
            'title': 'imanoreotwe-count-test',
        },
    }, {
        # tests returning multiple pages of a large collection
        'url': 'https://www.tiktok.com/@imanoreotwe/collection/%F0%9F%98%82-7111887189571160875',
        'playlist_mincount': 100,
        'info_dict': {
            'id': '7111887189571160875',
            'title': 'imanoreotwe-%F0%9F%98%82',
        },
    }]

    def _build_web_query(self, collection_id, cursor):
        """Return the query parameters for one page of the collection item list."""
        return {
            'aid': '1988',
            'collectionId': collection_id,
            'count': self._PAGE_COUNT,
            'cursor': cursor,
            'sourceType': '113',
        }

    def _entries(self, collection_id):
        """Yield flat `url_result` entries for every video in the collection.

        Pages are requested with cursors 0, `_PAGE_COUNT`, 2 * `_PAGE_COUNT`, ...
        until a response has no truthy `hasMore`.
        """
        for page, cursor in enumerate(itertools.count(0, self._PAGE_COUNT), 1):
            response = self._download_json(
                self._API_BASE_URL, collection_id, f'Downloading page {page}',
                query=self._build_web_query(collection_id, cursor))

            for video in traverse_obj(response, ('itemList', lambda _, v: v['id'])):
                video_id = video['id']
                # Use the first available author identifier, or '_' as a placeholder
                author = traverse_obj(video, ('author', ('uniqueId', 'secUid', 'id'), {str}, any)) or '_'
                webpage_url = self._create_url(author, video_id)
                flat_info = self._parse_aweme_video_web(video, webpage_url, video_id, extract_flat=True)
                yield self.url_result(webpage_url, TikTokIE, **flat_info)

            if not traverse_obj(response, 'hasMore'):
                return

    def _real_extract(self, url):
        """Return the collection as a playlist titled `<user>-<title>`, both taken verbatim from the URL."""
        mobj = self._match_valid_url(url)
        collection_id = mobj.group('id')
        user_name, title = mobj.group('user_id'), mobj.group('title')
        return self.playlist_result(
            self._entries(collection_id), collection_id, f'{user_name}-{title}')
1191
1192
ba723997 1193class DouyinIE(TikTokBaseIE):
943d5ab1
M
1194 _VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P<id>[0-9]+)'
1195 _TESTS = [{
1196 'url': 'https://www.douyin.com/video/6961737553342991651',
9ff94664 1197 'md5': '9ecce7bc5b302601018ecb2871c63a75',
943d5ab1
M
1198 'info_dict': {
1199 'id': '6961737553342991651',
1200 'ext': 'mp4',
1201 'title': '#杨超越 小小水手带你去远航❤️',
ba723997 1202 'description': '#杨超越 小小水手带你去远航❤️',
9ff94664 1203 'uploader': '6897520xka',
943d5ab1 1204 'uploader_id': '110403406559',
ba723997 1205 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
92593690 1206 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
f4f9f6d0 1207 'channel': '杨超越',
9ff94664 1208 'duration': 19,
ba723997 1209 'timestamp': 1620905839,
1210 'upload_date': '20210513',
1211 'track': '@杨超越创作的原声',
9ff94664 1212 'artists': ['杨超越'],
943d5ab1
M
1213 'view_count': int,
1214 'like_count': int,
1215 'repost_count': int,
1216 'comment_count': int,
92593690 1217 'thumbnail': r're:https?://.+\.jpe?g',
ba723997 1218 },
943d5ab1
M
1219 }, {
1220 'url': 'https://www.douyin.com/video/6982497745948921092',
9ff94664 1221 'md5': '15c5e660b7048af3707304e3cc02bbb5',
943d5ab1
M
1222 'info_dict': {
1223 'id': '6982497745948921092',
1224 'ext': 'mp4',
1225 'title': '这个夏日和小羊@杨超越 一起遇见白色幻想',
ba723997 1226 'description': '这个夏日和小羊@杨超越 一起遇见白色幻想',
9ff94664 1227 'uploader': '0731chaoyue',
943d5ab1 1228 'uploader_id': '408654318141572',
ba723997 1229 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
92593690 1230 'channel_id': 'MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
f4f9f6d0 1231 'channel': '杨超越工作室',
9ff94664 1232 'duration': 42,
ba723997 1233 'timestamp': 1625739481,
1234 'upload_date': '20210708',
1235 'track': '@杨超越工作室创作的原声',
9ff94664 1236 'artists': ['杨超越工作室'],
943d5ab1
M
1237 'view_count': int,
1238 'like_count': int,
1239 'repost_count': int,
1240 'comment_count': int,
92593690 1241 'thumbnail': r're:https?://.+\.jpe?g',
ba723997 1242 },
943d5ab1
M
1243 }, {
1244 'url': 'https://www.douyin.com/video/6953975910773099811',
9ff94664 1245 'md5': '0e6443758b8355db9a3c34864a4276be',
943d5ab1
M
1246 'info_dict': {
1247 'id': '6953975910773099811',
1248 'ext': 'mp4',
1249 'title': '#一起看海 出现在你的夏日里',
ba723997 1250 'description': '#一起看海 出现在你的夏日里',
9ff94664 1251 'uploader': '6897520xka',
943d5ab1 1252 'uploader_id': '110403406559',
ba723997 1253 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
92593690 1254 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
f4f9f6d0 1255 'channel': '杨超越',
9ff94664 1256 'duration': 17,
ba723997 1257 'timestamp': 1619098692,
1258 'upload_date': '20210422',
1259 'track': '@杨超越创作的原声',
9ff94664 1260 'artists': ['杨超越'],
943d5ab1
M
1261 'view_count': int,
1262 'like_count': int,
1263 'repost_count': int,
1264 'comment_count': int,
92593690 1265 'thumbnail': r're:https?://.+\.jpe?g',
ba723997 1266 },
943d5ab1
M
1267 }, {
1268 'url': 'https://www.douyin.com/video/6950251282489675042',
1269 'md5': 'b4db86aec367ef810ddd38b1737d2fed',
1270 'info_dict': {
1271 'id': '6950251282489675042',
1272 'ext': 'mp4',
1273 'title': '哈哈哈,成功了哈哈哈哈哈哈',
1274 'uploader': '杨超越',
1275 'upload_date': '20210412',
1276 'timestamp': 1618231483,
1277 'uploader_id': '110403406559',
1278 'view_count': int,
1279 'like_count': int,
1280 'repost_count': int,
1281 'comment_count': int,
ba723997 1282 },
1283 'skip': 'No longer available',
943d5ab1
M
1284 }, {
1285 'url': 'https://www.douyin.com/video/6963263655114722595',
9ff94664 1286 'md5': '1440bcf59d8700f8e014da073a4dfea8',
943d5ab1
M
1287 'info_dict': {
1288 'id': '6963263655114722595',
1289 'ext': 'mp4',
1290 'title': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
ba723997 1291 'description': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
9ff94664 1292 'uploader': '6897520xka',
943d5ab1 1293 'uploader_id': '110403406559',
ba723997 1294 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
92593690 1295 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
f4f9f6d0 1296 'channel': '杨超越',
9ff94664 1297 'duration': 15,
ba723997 1298 'timestamp': 1621261163,
1299 'upload_date': '20210517',
1300 'track': '@杨超越创作的原声',
9ff94664 1301 'artists': ['杨超越'],
943d5ab1
M
1302 'view_count': int,
1303 'like_count': int,
1304 'repost_count': int,
1305 'comment_count': int,
92593690 1306 'thumbnail': r're:https?://.+\.jpe?g',
ba723997 1307 },
943d5ab1 1308 }]
943d5ab1 1309 _UPLOADER_URL_FORMAT = 'https://www.douyin.com/user/%s'
53dad39e 1310 _WEBPAGE_HOST = 'https://www.douyin.com/'
943d5ab1
M
1311
def _real_extract(self, url):
    """Fetch the video's detail JSON from the Douyin web API and parse it.

    The download itself is non-fatal; when no `aweme_detail` dict comes back,
    an ExtractorError asking for fresh cookies is raised. That error is marked
    as expected only if no `s_v_web_id` cookie is currently set for the site.
    """
    video_id = self._match_id(url)

    response = self._download_json(
        'https://www.douyin.com/aweme/v1/web/aweme/detail/', video_id,
        'Downloading web detail JSON', 'Failed to download web detail JSON',
        query={'aweme_id': video_id}, fatal=False)
    detail = traverse_obj(response, ('aweme_detail', {dict}))
    if detail:
        return self._parse_aweme_video_app(detail)

    # TODO: Run verification challenge code to generate signature cookies
    has_web_id_cookie = self._get_cookies(self._WEBPAGE_HOST).get('s_v_web_id')
    raise ExtractorError(
        'Fresh cookies (not necessarily logged in) are needed',
        expected=not has_web_id_cookie)
88afe056 1326
1327
class TikTokVMIE(InfoExtractor):
    """Resolver for short links (`vm.tiktok.com`, `vt.tiktok.com`, `www.tiktok.com/t/`)."""
    IE_NAME = 'vm.tiktok'
    _VALID_URL = r'https?://(?:(?:vm|vt)\.tiktok\.com|(?:www\.)tiktok\.com/t)/(?P<id>\w+)'

    _TESTS = [{
        'url': 'https://www.tiktok.com/t/ZTRC5xgJp',
        'info_dict': {
            'id': '7170520270497680683',
            'ext': 'mp4',
            'title': 'md5:c64f6152330c2efe98093ccc8597871c',
            'uploader_id': '6687535061741700102',
            'upload_date': '20221127',
            'view_count': int,
            'like_count': int,
            'comment_count': int,
            'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAObqu3WCTXxmw2xwZ3iLEHnEecEIw7ks6rxWqOqOhaPja9BI7gqUQnjw8_5FSoDXX',
            'album': 'Wave of Mutilation: Best of Pixies',
            'thumbnail': r're:https://.+\.webp.*',
            'duration': 5,
            'timestamp': 1669516858,
            'repost_count': int,
            'artist': 'Pixies',
            'track': 'Where Is My Mind?',
            'description': 'md5:c64f6152330c2efe98093ccc8597871c',
            'uploader': 'sigmachaddeus',
            'creator': 'SigmaChad',
        },
    }, {
        'url': 'https://vm.tiktok.com/ZTR45GpSF/',
        'info_dict': {
            'id': '7106798200794926362',
            'ext': 'mp4',
            'title': 'md5:edc3e7ea587847f8537468f2fe51d074',
            'uploader_id': '6997695878846268418',
            'upload_date': '20220608',
            'view_count': int,
            'like_count': int,
            'comment_count': int,
            'thumbnail': r're:https://.+\.webp.*',
            'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAdZ_NcPPgMneaGrW0hN8O_J_bwLshwNNERRF5DxOw2HKIzk0kdlLrR8RkVl1ksrMO',
            'duration': 29,
            'timestamp': 1654680400,
            'repost_count': int,
            'artist': 'Akihitoko',
            'track': 'original sound',
            'description': 'md5:edc3e7ea587847f8537468f2fe51d074',
            'uploader': 'akihitoko1',
            'creator': 'Akihitoko',
        },
    }, {
        'url': 'https://vt.tiktok.com/ZSe4FqkKd',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Issue a HEAD request for the short link and delegate to whatever URL it resolves to.

        Raises UnsupportedError when the resolved URL is itself a short link
        this extractor would accept.
        """
        head_response = self._request_webpage(
            HEADRequest(url), self._match_id(url), headers={'User-Agent': 'facebookexternalhit/1.1'})
        resolved_url = head_response.url
        if self.suitable(resolved_url):  # Prevent infinite loop in case redirect fails
            raise UnsupportedError(resolved_url)
        return self.url_result(resolved_url)
933ed882
JC
1388
1389
216bcb66 1390class TikTokLiveIE(TikTokBaseIE):
1391 _VALID_URL = r'''(?x)https?://(?:
1392 (?:www\.)?tiktok\.com/@(?P<uploader>[\w.-]+)/live|
1393 m\.tiktok\.com/share/live/(?P<id>\d+)
1394 )'''
933ed882
JC
1395 IE_NAME = 'tiktok:live'
1396
1397 _TESTS = [{
216bcb66 1398 'url': 'https://www.tiktok.com/@weathernewslive/live',
1399 'info_dict': {
1400 'id': '7210809319192726273',
1401 'ext': 'mp4',
1402 'title': r're:ウェザーニュースLiVE[\d\s:-]*',
1403 'creator': 'ウェザーニュースLiVE',
1404 'uploader': 'weathernewslive',
1405 'uploader_id': '6621496731283095554',
1406 'uploader_url': 'https://www.tiktok.com/@weathernewslive',
1407 'live_status': 'is_live',
1408 'concurrent_view_count': int,
1409 },
1410 'params': {'skip_download': 'm3u8'},
1411 }, {
1412 'url': 'https://www.tiktok.com/@pilarmagenta/live',
1413 'info_dict': {
1414 'id': '7209423610325322522',
1415 'ext': 'mp4',
1416 'title': str,
1417 'creator': 'Pilarmagenta',
1418 'uploader': 'pilarmagenta',
1419 'uploader_id': '6624846890674683909',
1420 'uploader_url': 'https://www.tiktok.com/@pilarmagenta',
1421 'live_status': 'is_live',
1422 'concurrent_view_count': int,
1423 },
1424 'skip': 'Livestream',
1425 }, {
1426 'url': 'https://m.tiktok.com/share/live/7209423610325322522/?language=en',
1427 'only_matching': True,
1428 }, {
933ed882
JC
1429 'url': 'https://www.tiktok.com/@iris04201/live',
1430 'only_matching': True,
1431 }]
1432
def _call_api(self, url, param, room_id, uploader, key=None):
    """Query a live-room endpoint and return its payload only if the room is live.

    `room_id` is sent under the query key named by `param`. When `key` is
    given, the payload is taken from that key of the JSON response. The
    download is non-fatal; a missing or non-dict payload counts as not live.

    Raises UserNotLive when an uploader is known, otherwise an expected
    ExtractorError saying the livestream has ended.
    """
    payload = self._download_json(
        url, room_id, fatal=False, query={
            'aid': '1988',
            param: room_id,
        })
    response = traverse_obj(payload, (key, {dict}), default={})

    # status == 2 if live else 4
    if int_or_none(response.get('status')) == 2:
        return response
    if uploader:
        raise UserNotLive(video_id=uploader)
    # If room_id is obtained via mobile share URL and cannot be refreshed, do not wait for live
    raise ExtractorError('This livestream has ended', expected=True)
1447
def _real_extract(self, url):
    """Extract a TikTok livestream: resolve the room ID, then collect every advertised stream format.

    The URL provides either an uploader name or a room ID. The webpage (when
    it can be downloaded) is used to fill in whichever of the two is missing.
    Formats are gathered from several sections of the room-info response; if
    none of them is an mp4 (HLS) format, a second endpoint is queried for a
    fallback HLS URL.

    Raises UserNotLive when no room ID can be determined.
    """
    uploader, room_id = self._match_valid_url(url).group('uploader', 'id')
    display_id = uploader or room_id
    # The page download may fail softly only when the URL already carried a room ID
    webpage = self._download_webpage(
        url, display_id, headers={'User-Agent': 'Mozilla/5.0'}, fatal=not room_id)

    if webpage:
        data = self._get_sigi_state(webpage, display_id)
        room_id = (traverse_obj(data, ('UserModule', 'users', ..., 'roomId', {str_or_none}), get_all=False)
                   or self._search_regex(r'snssdk\d*://live\?room_id=(\d+)', webpage, 'room ID', default=None)
                   or room_id)
        if not uploader:
            uploader = traverse_obj(
                data, ('LiveRoom', 'liveRoomUserInfo', 'user', 'uniqueId'),
                ('UserModule', 'users', ..., 'uniqueId'), get_all=False, expected_type=str)

    if not room_id:
        raise UserNotLive(video_id=uploader)

    formats = []
    live_info = self._call_api(
        'https://webcast.tiktok.com/webcast/room/info', 'room_id', room_id, uploader, key='data')

    get_quality = qualities(('SD1', 'ld', 'SD2', 'sd', 'HD1', 'hd', 'FULL_HD1', 'uhd', 'ORIGION', 'origin'))

    def parse_inner(x):
        # Some fields hold JSON serialised as a string
        return self._parse_json(x, None)

    # 1) Per-quality streams described in the SDK pull data
    sdk_streams = traverse_obj(live_info, (
        'stream_url', 'live_core_sdk_data', 'pull_data', 'stream_data',
        {parse_inner}, 'data', {dict}), default={})
    for quality, stream in sdk_streams.items():
        sdk_params = traverse_obj(stream, ('main', 'sdk_params', {parse_inner}, {
            'vcodec': ('VCodec', {str}),
            'tbr': ('vbitrate', {lambda x: int_or_none(x, 1000)}),
            'resolution': ('resolution', {lambda x: re.match(r'(?i)\d+x\d+|\d+p', x).group().lower()}),
        }))

        for kind, container in (
            ('flv', {'ext': 'flv'}),
            ('hls', {'ext': 'mp4', 'protocol': 'm3u8_native'}),
        ):
            main_url = traverse_obj(stream, ('main', kind, {url_or_none}))
            if not main_url:
                continue
            formats.append({
                'url': main_url,
                **container,
                'format_id': f'{kind}-{quality}',
                'quality': get_quality(quality),
                **sdk_params,
            })

    def get_vcodec(*keys):
        return traverse_obj(live_info, (
            'stream_url', *keys, {parse_inner}, 'VCodec', {str}))

    # 2) Single pull URLs for HLS and RTMP
    for proto in ('hls', 'rtmp'):
        pull_url = traverse_obj(live_info, ('stream_url', f'{proto}_pull_url', {url_or_none}))
        if not pull_url:
            continue
        is_hls = proto == 'hls'
        formats.append({
            'url': pull_url,
            'ext': 'mp4' if is_hls else 'flv',
            'protocol': 'm3u8_native' if is_hls else 'https',
            'format_id': f'{proto}-pull',
            'vcodec': get_vcodec(f'{proto}_pull_url_params'),
            'quality': get_quality('ORIGION'),
        })

    # 3) FLV pull URLs keyed by quality name
    flv_pulls = traverse_obj(live_info, ('stream_url', 'flv_pull_url', {dict}), default={})
    for f_id, f_url in flv_pulls.items():
        if url_or_none(f_url):
            formats.append({
                'url': f_url,
                'ext': 'flv',
                'format_id': f'flv-{f_id}'.lower(),
                'vcodec': get_vcodec('flv_pull_url_params', f_id),
                'quality': get_quality(f_id),
            })

    # If uploader is a guest on another's livestream, primary endpoint will not have m3u8 URLs
    if not any(fmt['ext'] == 'mp4' for fmt in formats):
        live_info = merge_dicts(live_info, self._call_api(
            'https://www.tiktok.com/api/live/detail/', 'roomID', room_id, uploader, key='LiveRoomInfo'))
        if url_or_none(live_info.get('liveUrl')):
            formats.append({
                'url': live_info['liveUrl'],
                'ext': 'mp4',
                'protocol': 'm3u8_native',
                'format_id': 'hls-fallback',
                'vcodec': 'h264',
                'quality': get_quality('origin'),
            })

    if not uploader:
        uploader = traverse_obj(live_info, ('ownerInfo', 'uniqueId'), ('owner', 'display_id'))

    # Remaining metadata; each field takes the first value found among its alternative locations
    metadata = traverse_obj(live_info, {
        'title': 'title',
        'uploader_id': (('ownerInfo', 'owner'), 'id', {str_or_none}),
        'creator': (('ownerInfo', 'owner'), 'nickname'),
        'concurrent_view_count': (('user_count', ('liveRoomStats', 'userCount')), {int_or_none}),
    }, get_all=False)

    return {
        'id': room_id,
        'uploader': uploader,
        'uploader_url': format_field(uploader, None, self._UPLOADER_URL_FORMAT) or None,
        'is_live': True,
        'formats': formats,
        '_format_sort_fields': ('quality', 'ext'),
        **metadata,
    }