]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/tiktok.py
[cleanup] Add more ruff rules (#10149)
[yt-dlp.git] / yt_dlp / extractor / tiktok.py
CommitLineData
3584b839 1import functools
f7f18f90 2import itertools
b801cd71 3import json
bd9ff55b 4import random
216bcb66 5import re
347f13dd 6import string
bd9ff55b 7import time
add96eb9 8import urllib.parse
cb61e20c 9import uuid
1ead840d
KS
10
11from .common import InfoExtractor
3d2623a8 12from ..networking import HEADRequest
1ead840d 13from ..utils import (
ce18a19b 14 ExtractorError,
11e1c2e3 15 UnsupportedError,
933ed882 16 UserNotLive,
8ceb07e8 17 determine_ext,
3584b839 18 filter_dict,
216bcb66 19 format_field,
1ead840d 20 int_or_none,
34921b43 21 join_nonempty,
216bcb66 22 merge_dicts,
4ccd73fe 23 mimetype2ext,
24 parse_qs,
b801cd71 25 qualities,
ba723997 26 remove_start,
e0585e65 27 srt_subtitles_timecode,
1ead840d 28 str_or_none,
bd9ff55b 29 traverse_obj,
216bcb66 30 try_call,
bd9ff55b 31 try_get,
943d5ab1 32 url_or_none,
1ead840d
KS
33)
34
35
class TikTokBaseIE(InfoExtractor):
    """Shared helpers for the TikTok extractors.

    Covers three areas: querying the mobile-app API with rotating "app info"
    parameters, pulling hydration JSON out of the webpage, and converting the
    resulting "aweme" (video) metadata into info dicts.
    """

    _UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s'
    # Host whose cookies (notably `sid_tt`) are copied over to the API/CDN hosts
    _WEBPAGE_HOST = 'https://www.tiktok.com/'
    # Ordered from worst to best; passed to `qualities()` in `_parse_url_key`
    QUALITIES = ('360p', '540p', '720p', '1080p')

    _APP_INFO_DEFAULTS = {
        # unique "install id"
        'iid': None,
        # TikTok (KR/PH/TW/TH/VN) = trill, TikTok (rest of world) = musical_ly, Douyin = aweme
        'app_name': 'musical_ly',
        'app_version': '34.1.2',
        'manifest_app_version': '2023401020',
        # "app id": aweme = 1128, trill = 1180, musical_ly = 1233, universal = 0
        'aid': '0',
    }
    # Remaining app-info candidates; built lazily by `_get_next_app_info`
    _APP_INFO_POOL = None
    # App info currently in use, and the User-Agent derived from it
    _APP_INFO = None
    _APP_USER_AGENT = None

    @functools.cached_property
    def _KNOWN_APP_INFO(self):
        """User-supplied `app_info` strings (extractor-arg), each one a `/`-separated record."""
        # If we have a genuine device ID, we may not need any IID
        default = [''] if self._KNOWN_DEVICE_ID else []
        return self._configuration_arg('app_info', default, ie_key=TikTokIE)

    @functools.cached_property
    def _KNOWN_DEVICE_ID(self):
        """User-supplied `device_id` extractor-arg, or None when not given."""
        return self._configuration_arg('device_id', [None], ie_key=TikTokIE)[0]

    @functools.cached_property
    def _DEVICE_ID(self):
        """Device ID sent to the API: the user-supplied one, else a random integer string."""
        return self._KNOWN_DEVICE_ID or str(random.randint(7250000000000000000, 7351147085025500000))

    @functools.cached_property
    def _API_HOSTNAME(self):
        """Hostname of the mobile API; overridable through the `api_hostname` extractor-arg."""
        return self._configuration_arg(
            'api_hostname', ['api16-normal-c-useast1a.tiktokv.com'], ie_key=TikTokIE)[0]

    def _get_next_app_info(self):
        """Switch to the next candidate app info.

        Builds the candidate pool on first use, then pops its first entry into
        `self._APP_INFO` and derives `self._APP_USER_AGENT` from it.

        Returns True when a new app info was activated, False when the pool is empty.
        """
        if self._APP_INFO_POOL is None:
            # Per-field defaults, each overridable by an extractor-arg of the same name
            defaults = {
                key: self._configuration_arg(key, [default], ie_key=TikTokIE)[0]
                for key, default in self._APP_INFO_DEFAULTS.items()
                if key != 'iid'
            }
            # Each known app info is positional ("iid/app_name/app_version/...");
            # empty positions fall back to the defaults above
            self._APP_INFO_POOL = [
                {**defaults, **{
                    k: v for k, v in zip(self._APP_INFO_DEFAULTS, app_info.split('/')) if v
                }} for app_info in self._KNOWN_APP_INFO
            ]

        if not self._APP_INFO_POOL:
            return False

        self._APP_INFO = self._APP_INFO_POOL.pop(0)

        app_name = self._APP_INFO['app_name']
        version = self._APP_INFO['manifest_app_version']
        if app_name == 'musical_ly':
            package = f'com.zhiliaoapp.musically/{version}'
        else:  # trill, aweme
            package = f'com.ss.android.ugc.{app_name}/{version}'
        self._APP_USER_AGENT = f'{package} (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)'

        return True

    @staticmethod
    def _create_url(user_id, video_id):
        """Build the canonical webpage URL for a video; `_` stands in for a missing user."""
        return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}'

    def _get_sigi_state(self, webpage, display_id):
        """Return the JSON from the SIGI_STATE / sigi-persisted-data <script>, or {}."""
        return self._search_json(
            r'<script[^>]+\bid="(?:SIGI_STATE|sigi-persisted-data)"[^>]*>', webpage,
            'sigi state', display_id, end_pattern=r'</script>', default={})

    def _get_universal_data(self, webpage, display_id):
        """Return the `__DEFAULT_SCOPE__` dict of the universal rehydration <script>, or {}."""
        return traverse_obj(self._search_json(
            r'<script[^>]+\bid="__UNIVERSAL_DATA_FOR_REHYDRATION__"[^>]*>', webpage,
            'universal data', display_id, end_pattern=r'</script>', default={}),
            ('__DEFAULT_SCOPE__', {dict})) or {}

    def _call_api_impl(self, ep, query, video_id, fatal=True,
                       note='Downloading API JSON', errnote='Unable to download API page'):
        """Perform a single request against `/aweme/v1/<ep>/` and return the decoded JSON.

        Sets a random `odin_tt` cookie on the API host and forwards the webpage's
        `sid_tt` session cookie to it when one is present.
        """
        self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choices('0123456789abcdef', k=160)))
        webpage_cookies = self._get_cookies(self._WEBPAGE_HOST)
        if webpage_cookies.get('sid_tt'):
            self._set_cookie(self._API_HOSTNAME, 'sid_tt', webpage_cookies['sid_tt'].value)
        return self._download_json(
            f'https://{self._API_HOSTNAME}/aweme/v1/{ep}/', video_id=video_id,
            fatal=fatal, note=note, errnote=errnote, headers={
                'User-Agent': self._APP_USER_AGENT,
                'Accept': 'application/json',
            }, query=query)

    def _build_api_query(self, query):
        """Merge `query` with the device/app parameters sent on every API call.

        The fixed parameters listed after `**query` take precedence over
        same-named caller keys. The result is passed through `filter_dict`.
        Requires `self._APP_INFO` to be set.
        """
        return filter_dict({
            **query,
            'device_platform': 'android',
            'os': 'android',
            'ssmix': 'a',
            '_rticket': int(time.time() * 1000),
            'cdid': str(uuid.uuid4()),
            'channel': 'googleplay',
            'aid': self._APP_INFO['aid'],
            'app_name': self._APP_INFO['app_name'],
            # e.g. '34.1.2' -> '340102'
            'version_code': ''.join(f'{int(v):02d}' for v in self._APP_INFO['app_version'].split('.')),
            'version_name': self._APP_INFO['app_version'],
            'manifest_version_code': self._APP_INFO['manifest_app_version'],
            'update_version_code': self._APP_INFO['manifest_app_version'],
            'ab_version': self._APP_INFO['app_version'],
            'resolution': '1080*2400',
            'dpi': 420,
            'device_type': 'Pixel 7',
            'device_brand': 'Google',
            'language': 'en',
            'os_api': '29',
            'os_version': '13',
            'ac': 'wifi',
            'is_pad': '0',
            'current_region': 'US',
            'app_type': 'normal',
            'sys_region': 'US',
            'last_install_time': int(time.time()) - random.randint(86400, 1123200),
            'timezone_name': 'America/New_York',
            'residence': 'US',
            'app_language': 'en',
            'timezone_offset': '-14400',
            'host_abi': 'armeabi-v7a',
            'locale': 'en',
            'ac2': 'wifi5g',
            'uoo': '1',
            'carrier_region': 'US',
            'op_region': 'US',
            'build_number': self._APP_INFO['app_version'],
            'region': 'US',
            'ts': int(time.time()),
            'iid': self._APP_INFO.get('iid'),
            'device_id': self._DEVICE_ID,
            'openudid': ''.join(random.choices('0123456789abcdef', k=16)),
        })

    def _call_api(self, ep, query, video_id, fatal=True,
                  note='Downloading API JSON', errnote='Unable to download API page'):
        """Call the mobile API, rotating through the app-info pool on failure.

        A response that fails JSON decoding at position 0 triggers a retry with
        the next app info. Any other error is re-raised immediately.

        Returns the decoded JSON. When `fatal` is False, returns None after a
        warning once no app info is left.
        Raises ExtractorError when `fatal` is True and no (working) app info is available.
        """
        if not self._APP_INFO and not self._get_next_app_info():
            message = 'No working app info is available'
            if fatal:
                raise ExtractorError(message, expected=True)
            else:
                self.report_warning(message)
                return

        max_tries = len(self._APP_INFO_POOL) + 1  # _APP_INFO_POOL + _APP_INFO
        for count in itertools.count(1):
            self.write_debug(str(self._APP_INFO))
            # Rebuilt on every attempt: it depends on the current app info and timestamps
            real_query = self._build_api_query(query)
            try:
                return self._call_api_impl(ep, real_query, video_id, fatal, note, errnote)
            except ExtractorError as e:
                if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0:
                    message = str(e.cause or e.msg)
                    if not self._get_next_app_info():
                        if fatal:
                            raise
                        else:
                            self.report_warning(message)
                            return
                    self.report_warning(f'{message}. Retrying... (attempt {count} of {max_tries})')
                    continue
                raise

    def _extract_aweme_app(self, aweme_id):
        """Fetch a video through the app `feed` endpoint and parse it.

        Raises ExtractorError when the feed does not contain `aweme_id`.
        """
        feed_list = self._call_api(
            'feed', {'aweme_id': aweme_id}, aweme_id, note='Downloading video feed',
            errnote='Unable to download video feed').get('aweme_list') or []
        aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None)
        if not aweme_detail:
            raise ExtractorError('Unable to find video in feed', video_id=aweme_id)
        return self._parse_aweme_video_app(aweme_detail)

    def _extract_web_data_and_status(self, url, video_id, fatal=True):
        """Download the webpage and return `(video_data, status)` from its embedded JSON.

        Tries the universal rehydration data, then the SIGI state, then next.js
        data; the first one found is used. `status` is 0 when the chosen source
        carries no status code. When none is found, `({}, None)` is returned,
        or ExtractorError is raised if `fatal` is True.
        """
        webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'}, fatal=fatal) or ''
        video_data, status = {}, None

        if universal_data := self._get_universal_data(webpage, video_id):
            self.write_debug('Found universal data for rehydration')
            status = traverse_obj(universal_data, ('webapp.video-detail', 'statusCode', {int})) or 0
            video_data = traverse_obj(universal_data, ('webapp.video-detail', 'itemInfo', 'itemStruct', {dict}))

        elif sigi_data := self._get_sigi_state(webpage, video_id):
            self.write_debug('Found sigi state data')
            status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0
            video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict}))

        elif next_data := self._search_nextjs_data(webpage, video_id, default={}):
            self.write_debug('Found next.js data')
            status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0
            video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict}))

        elif fatal:
            raise ExtractorError('Unable to extract webpage video data')

        return video_data, status

    def _get_subtitles(self, aweme_detail, aweme_id, user_name):
        """Collect subtitles for a video, keyed by language code.

        Sources are tried in order and later ones are consulted only while
        nothing has been found: auto captions from the aweme detail, caption
        infos from the feed, and finally `subtitleInfos` from the webpage.
        The webpage is downloaded here only when `user_name` is given.
        """
        # TODO: Extract text positioning info
        subtitles = {}
        # aweme/detail endpoint subs
        captions_info = traverse_obj(
            aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 'auto_captions', ...), expected_type=dict)
        for caption in captions_info:
            caption_url = traverse_obj(caption, ('url', 'url_list', ...), expected_type=url_or_none, get_all=False)
            if not caption_url:
                continue
            caption_json = self._download_json(
                caption_url, aweme_id, note='Downloading captions', errnote='Unable to download captions', fatal=False)
            if not caption_json:
                continue
            # The download is best-effort, so an unexpected payload must not abort extraction
            utterances = traverse_obj(caption_json, ('utterances', ..., {dict}))
            if not utterances:
                continue
            # `or 'en'` (not a .get default) so that a null language also falls back,
            # matching the other two subtitle sources below
            subtitles.setdefault(caption.get('language') or 'en', []).append({
                'ext': 'srt',
                'data': '\n\n'.join(
                    f'{i + 1}\n{srt_subtitles_timecode(line["start_time"] / 1000)} --> {srt_subtitles_timecode(line["end_time"] / 1000)}\n{line["text"]}'
                    for i, line in enumerate(utterances) if line.get('text')),
            })
        # feed endpoint subs
        if not subtitles:
            for caption in traverse_obj(aweme_detail, ('video', 'cla_info', 'caption_infos', ...), expected_type=dict):
                if not caption.get('url'):
                    continue
                subtitles.setdefault(caption.get('lang') or 'en', []).append({
                    'ext': remove_start(caption.get('caption_format'), 'web'),
                    'url': caption['url'],
                })
        # webpage subs
        if not subtitles:
            if user_name:  # only _parse_aweme_video_app needs to extract the webpage here
                aweme_detail, _ = self._extract_web_data_and_status(
                    self._create_url(user_name, aweme_id), aweme_id, fatal=False)
            for caption in traverse_obj(aweme_detail, ('video', 'subtitleInfos', lambda _, v: v['Url'])):
                subtitles.setdefault(caption.get('LanguageCodeName') or 'en', []).append({
                    'ext': remove_start(caption.get('Format'), 'web'),
                    'url': caption['Url'],
                })
        return subtitles

    def _parse_url_key(self, url_key):
        """Parse a URL key of the form `v..._<codec>_<res>p_<bitrate>`.

        Returns `(format_info, res)`, where `res` is the resolution string such
        as '720p'. Returns `({}, None)` when the key does not match.
        """
        format_id, codec, res, bitrate = self._search_regex(
            r'v[^_]+_(?P<id>(?P<codec>[^_]+)_(?P<res>\d+p)_(?P<bitrate>\d+))', url_key,
            'url key', default=(None, None, None, None), group=('id', 'codec', 'res', 'bitrate'))
        if not format_id:
            return {}, None
        return {
            'format_id': format_id,
            'vcodec': 'h265' if codec == 'bytevc1' else codec,
            'tbr': int_or_none(bitrate, scale=1000) or None,
            'quality': qualities(self.QUALITIES)(res),
        }, res

    def _parse_aweme_video_app(self, aweme_detail):
        """Convert an aweme dict from the mobile API into an info dict.

        `aweme_detail` must contain the `aweme_id` and `video` keys.
        """
        aweme_id = aweme_detail['aweme_id']
        video_info = aweme_detail['video']
        # Dimensions seen so far per resolution string, shared across all addresses
        known_resolutions = {}

        def audio_meta(url):
            # Audio-only overrides for mp3 / "-music-" URLs; empty for regular video URLs
            ext = determine_ext(url, default_ext='m4a')
            return {
                'format_note': 'Music track',
                'ext': ext,
                'acodec': 'aac' if ext == 'm4a' else ext,
                'vcodec': 'none',
                'width': None,
                'height': None,
            } if ext == 'mp3' or '-music-' in url else {}

        def extract_addr(addr, add_meta=None):
            # Work on a private copy: this function adds keys to `add_meta`, which must
            # neither leak into the caller's dict nor persist in a shared default
            add_meta = dict(add_meta or {})
            parsed_meta, res = self._parse_url_key(addr.get('url_key', ''))
            is_bytevc2 = parsed_meta.get('vcodec') == 'bytevc2'
            if res:
                known_resolutions.setdefault(res, {}).setdefault('height', int_or_none(addr.get('height')))
                known_resolutions[res].setdefault('width', int_or_none(addr.get('width')))
                parsed_meta.update(known_resolutions.get(res, {}))
                add_meta.setdefault('height', int_or_none(res[:-1]))
            return [{
                'url': url,
                'filesize': int_or_none(addr.get('data_size')),
                'ext': 'mp4',
                'acodec': 'aac',
                'source_preference': -2 if 'aweme/v1' in url else -1,  # Downloads from API might get blocked
                **add_meta, **parsed_meta,
                # bytevc2 is bytedance's own custom h266/vvc codec, as-of-yet unplayable
                # Otherwise honour a caller-supplied preference (e.g. watermarked downloads)
                'preference': -100 if is_bytevc2 else add_meta.get('preference', -1),
                'format_note': join_nonempty(
                    add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None,
                    '(UNPLAYABLE)' if is_bytevc2 else None, delim=' '),
                **audio_meta(url),
            } for url in addr.get('url_list') or []]

        # Hack: Add direct video links first to prioritize them when removing duplicate formats
        formats = []
        width = int_or_none(video_info.get('width'))
        height = int_or_none(video_info.get('height'))
        # Falls back to 0.5625 (9:16) when the dimensions are missing or unusable
        ratio = try_call(lambda: width / height) or 0.5625
        if video_info.get('play_addr'):
            formats.extend(extract_addr(video_info['play_addr'], {
                'format_id': 'play_addr',
                'format_note': 'Direct video',
                'vcodec': 'h265' if traverse_obj(
                    video_info, 'is_bytevc1', 'is_h265') else 'h264',  # TODO: Check for "direct iOS" videos, like https://www.tiktok.com/@cookierun_dev/video/7039716639834656002
                'width': width,
                'height': height,
            }))
        if video_info.get('download_addr'):
            download_addr = video_info['download_addr']
            dl_width = int_or_none(download_addr.get('width'))
            formats.extend(extract_addr(download_addr, {
                'format_id': 'download_addr',
                'format_note': 'Download video%s' % (', watermarked' if video_info.get('has_watermark') else ''),
                'vcodec': 'h264',
                'width': dl_width,
                'height': try_call(lambda: int(dl_width / ratio)),  # download_addr['height'] is wrong
                'preference': -2 if video_info.get('has_watermark') else -1,
            }))
        if video_info.get('play_addr_h264'):
            formats.extend(extract_addr(video_info['play_addr_h264'], {
                'format_id': 'play_addr_h264',
                'format_note': 'Direct video',
                'vcodec': 'h264',
            }))
        if video_info.get('play_addr_bytevc1'):
            formats.extend(extract_addr(video_info['play_addr_bytevc1'], {
                'format_id': 'play_addr_bytevc1',
                'format_note': 'Direct video',
                'vcodec': 'h265',
            }))

        # `or []` also covers an explicit null value, not only a missing key
        for bitrate in video_info.get('bit_rate') or []:
            if bitrate.get('play_addr'):
                formats.extend(extract_addr(bitrate['play_addr'], {
                    'format_id': bitrate.get('gear_name'),
                    'format_note': 'Playback video',
                    'tbr': try_get(bitrate, lambda x: x['bit_rate'] / 1000),
                    'vcodec': 'h265' if traverse_obj(
                        bitrate, 'is_bytevc1', 'is_h265') else 'h264',
                    'fps': bitrate.get('FPS'),
                }))

        self._remove_duplicate_formats(formats)
        # Forward the session cookie to every host that serves a format
        auth_cookie = self._get_cookies(self._WEBPAGE_HOST).get('sid_tt')
        if auth_cookie:
            for f in formats:
                self._set_cookie(urllib.parse.urlparse(f['url']).hostname, 'sid_tt', auth_cookie.value)

        thumbnails = []
        for cover_id in ('cover', 'ai_dynamic_cover', 'animated_cover', 'ai_dynamic_cover_bak',
                         'origin_cover', 'dynamic_cover'):
            for cover_url in traverse_obj(video_info, (cover_id, 'url_list', ...)):
                thumbnails.append({
                    'id': cover_id,
                    'url': cover_url,
                })

        stats_info = aweme_detail.get('statistics') or {}
        music_info = aweme_detail.get('music') or {}
        labels = traverse_obj(aweme_detail, ('hybrid_label', ..., 'text'), expected_type=str)

        contained_music_track = traverse_obj(
            music_info, ('matched_song', 'title'), ('matched_pgc_sound', 'title'), expected_type=str)
        contained_music_author = traverse_obj(
            music_info, ('matched_song', 'author'), ('matched_pgc_sound', 'author'), 'author', expected_type=str)

        # An auto-generated "original sound - <handle>" title carries no real track name
        is_generic_og_trackname = music_info.get('is_original_sound') and music_info.get('title') == 'original sound - {}'.format(music_info.get('owner_handle'))
        if is_generic_og_trackname:
            music_track, music_author = contained_music_track or 'original sound', contained_music_author
        else:
            music_track, music_author = music_info.get('title'), traverse_obj(music_info, ('author', {str}))

        author_info = traverse_obj(aweme_detail, ('author', {
            'uploader': ('unique_id', {str}),
            'uploader_id': ('uid', {str_or_none}),
            'channel': ('nickname', {str}),
            'channel_id': ('sec_uid', {str}),
        }))

        return {
            'id': aweme_id,
            **traverse_obj(aweme_detail, {
                'title': ('desc', {str}),
                'description': ('desc', {str}),
                'timestamp': ('create_time', {int_or_none}),
            }),
            **traverse_obj(stats_info, {
                'view_count': 'play_count',
                'like_count': 'digg_count',
                'repost_count': 'share_count',
                'comment_count': 'comment_count',
            }, expected_type=int_or_none),
            **author_info,
            'channel_url': format_field(author_info, 'channel_id', self._UPLOADER_URL_FORMAT, default=None),
            'uploader_url': format_field(
                author_info, ['uploader', 'uploader_id'], self._UPLOADER_URL_FORMAT, default=None),
            'track': music_track,
            'album': str_or_none(music_info.get('album')) or None,
            'artists': re.split(r'(?:, | & )', music_author) if music_author else None,
            'formats': formats,
            'subtitles': self.extract_subtitles(
                aweme_detail, aweme_id, traverse_obj(author_info, 'uploader', 'uploader_id', 'channel_id')),
            'thumbnails': thumbnails,
            # Video durations are scaled down by 1000; the music duration is the fallback
            'duration': (traverse_obj(video_info, (
                (None, 'download_addr'), 'duration', {functools.partial(int_or_none, scale=1000)}, any))
                or traverse_obj(music_info, ('duration', {int_or_none}))),
            'availability': self._availability(
                is_private='Private' in labels,
                needs_subscription='Friends only' in labels,
                is_unlisted='Followers only' in labels),
            '_format_sort_fields': ('quality', 'codec', 'size', 'br'),
        }

    def _extract_web_formats(self, aweme_detail):
        """Build the formats list from webpage video data (`bitrateInfo`, `playAddr`, download URLs).

        Falls back to a single audio-only format from `music.playUrl` when no
        video format was found.
        """
        COMMON_FORMAT_INFO = {
            'ext': 'mp4',
            'vcodec': 'h264',
            'acodec': 'aac',
        }
        video_info = traverse_obj(aweme_detail, ('video', {dict})) or {}
        play_width = int_or_none(video_info.get('width'))
        play_height = int_or_none(video_info.get('height'))
        # Falls back to 0.5625 (9:16) when the dimensions are missing or unusable
        ratio = try_call(lambda: play_width / play_height) or 0.5625
        formats = []

        for bitrate_info in traverse_obj(video_info, ('bitrateInfo', lambda _, v: v['PlayAddr']['UrlList'])):
            format_info, res = self._parse_url_key(
                traverse_obj(bitrate_info, ('PlayAddr', 'UrlKey', {str})) or '')
            # bytevc2 is bytedance's own custom h266/vvc codec, as-of-yet unplayable
            is_bytevc2 = format_info.get('vcodec') == 'bytevc2'
            format_info.update({
                'format_note': 'UNPLAYABLE' if is_bytevc2 else None,
                'preference': -100 if is_bytevc2 else -1,
                'filesize': traverse_obj(bitrate_info, ('PlayAddr', 'DataSize', {int_or_none})),
            })

            if dimension := (res and int(res[:-1])):
                if dimension == 540:  # '540p' is actually 576p
                    dimension = 576
                if ratio < 1:  # portrait: res/dimension is width
                    y = int(dimension / ratio)
                    format_info.update({
                        'width': dimension,
                        'height': y - (y % 2),  # round down to an even number
                    })
                else:  # landscape: res/dimension is height
                    x = int(dimension * ratio)
                    format_info.update({
                        'width': x + (x % 2),  # round up to an even number
                        'height': dimension,
                    })

            for video_url in traverse_obj(bitrate_info, ('PlayAddr', 'UrlList', ..., {url_or_none})):
                formats.append({
                    **COMMON_FORMAT_INFO,
                    **format_info,
                    'url': self._proto_relative_url(video_url),
                })

        # We don't have res string for play formats, but need quality for sorting & de-duplication
        play_quality = traverse_obj(formats, (lambda _, v: v['width'] == play_width, 'quality', any))

        for play_url in traverse_obj(video_info, ('playAddr', ((..., 'src'), None), {url_or_none})):
            formats.append({
                **COMMON_FORMAT_INFO,
                'format_id': 'play',
                'url': self._proto_relative_url(play_url),
                'width': play_width,
                'height': play_height,
                'quality': play_quality,
            })

        for download_url in traverse_obj(video_info, (('downloadAddr', ('download', 'url')), {url_or_none})):
            formats.append({
                **COMMON_FORMAT_INFO,
                'format_id': 'download',
                'url': self._proto_relative_url(download_url),
            })

        self._remove_duplicate_formats(formats)

        # Any URL without "unwatermarked" in it is labelled as watermarked
        for f in traverse_obj(formats, lambda _, v: 'unwatermarked' not in v['url']):
            f.update({
                'format_note': join_nonempty(f.get('format_note'), 'watermarked', delim=', '),
                'preference': f.get('preference') or -2,
            })

        # Is it a slideshow with only audio for download?
        if not formats and traverse_obj(aweme_detail, ('music', 'playUrl', {url_or_none})):
            audio_url = aweme_detail['music']['playUrl']
            # The `mime_type` query parameter uses "_" in place of "/"
            ext = traverse_obj(parse_qs(audio_url), (
                'mime_type', -1, {lambda x: x.replace('_', '/')}, {mimetype2ext})) or 'm4a'
            formats.append({
                'format_id': 'audio',
                'url': self._proto_relative_url(audio_url),
                'ext': ext,
                'acodec': 'aac' if ext == 'm4a' else ext,
                'vcodec': 'none',
            })

        return formats

    def _parse_aweme_video_web(self, aweme_detail, webpage_url, video_id, extract_flat=False):
        """Convert webpage video data into an info dict.

        With `extract_flat`, formats and subtitles are skipped (set to None).
        Later `**` groups override earlier ones, so a non-zero video duration
        replaces the music duration.
        """
        author_info = traverse_obj(aweme_detail, (('authorInfo', 'author', None), {
            'channel': ('nickname', {str}),
            'channel_id': (('authorSecId', 'secUid'), {str}),
            'uploader': (('uniqueId', 'author'), {str}),
            'uploader_id': (('authorId', 'uid', 'id'), {str_or_none}),
        }), get_all=False)

        return {
            'id': video_id,
            'formats': None if extract_flat else self._extract_web_formats(aweme_detail),
            'subtitles': None if extract_flat else self.extract_subtitles(aweme_detail, video_id, None),
            'http_headers': {'Referer': webpage_url},
            **author_info,
            'channel_url': format_field(author_info, 'channel_id', self._UPLOADER_URL_FORMAT, default=None),
            'uploader_url': format_field(
                author_info, ['uploader', 'uploader_id'], self._UPLOADER_URL_FORMAT, default=None),
            **traverse_obj(aweme_detail, ('music', {
                'track': ('title', {str}),
                'album': ('album', {str}, {lambda x: x or None}),
                'artists': ('authorName', {str}, {lambda x: re.split(r'(?:, | & )', x) if x else None}),
                'duration': ('duration', {int_or_none}),
            })),
            **traverse_obj(aweme_detail, {
                'title': ('desc', {str}),
                'description': ('desc', {str}),
                # audio-only slideshows have a video duration of 0 and an actual audio duration
                'duration': ('video', 'duration', {int_or_none}, {lambda x: x or None}),
                'timestamp': ('createTime', {int_or_none}),
            }),
            **traverse_obj(aweme_detail, ('stats', {
                'view_count': 'playCount',
                'like_count': 'diggCount',
                'repost_count': 'shareCount',
                'comment_count': 'commentCount',
            }), expected_type=int_or_none),
            'thumbnails': traverse_obj(aweme_detail, (
                (None, 'video'), ('thumbnail', 'cover', 'dynamicCover', 'originCover'), {
                    'url': ({url_or_none}, {self._proto_relative_url}),
                },
            )),
        }
583
0fd6661e
M
584
585class TikTokIE(TikTokBaseIE):
c4cbd3be 586 _VALID_URL = r'https?://www\.tiktok\.com/(?:embed|@(?P<user_id>[\w\.-]+)?/video)/(?P<id>\d+)'
bfd973ec 587 _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
0fd6661e
M
588
589 _TESTS = [{
590 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610',
0481e266 591 'md5': '736bb7a466c6f0a6afeb597da1e6f5b7',
0fd6661e
M
592 'info_dict': {
593 'id': '6748451240264420610',
594 'ext': 'mp4',
595 'title': '#jassmanak #lehanga #leenabhushan',
596 'description': '#jassmanak #lehanga #leenabhushan',
597 'duration': 13,
0481e266 598 'height': 1024,
599 'width': 576,
0fd6661e
M
600 'uploader': 'leenabhushan',
601 'uploader_id': '6691488002098119685',
0481e266 602 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA_Eb4t1vodM1IuTy_cvp9CY22RAb59xqrO0Xtz9CYQJvgXaDvZxYnZYRzDWhhgJmy',
0fd6661e
M
603 'creator': 'facestoriesbyleenabh',
604 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
605 'upload_date': '20191016',
606 'timestamp': 1571246252,
607 'view_count': int,
608 'like_count': int,
609 'repost_count': int,
610 'comment_count': int,
a44ca5a4 611 'artist': 'Ysrbeats',
612 'album': 'Lehanga',
613 'track': 'Lehanga',
92593690 614 },
615 'skip': '404 Not Found',
0fd6661e
M
616 }, {
617 'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en',
347f13dd 618 'md5': 'f21112672ee4ce05ca390fb6522e1b6f',
0fd6661e
M
619 'info_dict': {
620 'id': '6742501081818877190',
621 'ext': 'mp4',
622 'title': 'md5:5e2a23877420bb85ce6521dbee39ba94',
623 'description': 'md5:5e2a23877420bb85ce6521dbee39ba94',
624 'duration': 27,
347f13dd 625 'height': 1024,
626 'width': 576,
0fd6661e
M
627 'uploader': 'patrox',
628 'uploader_id': '18702747',
347f13dd 629 'uploader_url': 'https://www.tiktok.com/@patrox',
630 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws',
92593690 631 'channel_id': 'MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws',
f4f9f6d0 632 'channel': 'patroX',
0fd6661e
M
633 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
634 'upload_date': '20190930',
635 'timestamp': 1569860870,
636 'view_count': int,
637 'like_count': int,
638 'repost_count': int,
639 'comment_count': int,
f4f9f6d0 640 'artists': ['Evan Todd', 'Jessica Keenan Wynn', 'Alice Lee', 'Barrett Wilbert Weed', 'Jon Eidson'],
a44ca5a4 641 'track': 'Big Fun',
92593690 642 },
0fd6661e 643 }, {
347f13dd 644 # Banned audio, was available on the app, now works with web too
96f13f01
M
645 'url': 'https://www.tiktok.com/@barudakhb_/video/6984138651336838402',
646 'info_dict': {
647 'id': '6984138651336838402',
648 'ext': 'mp4',
649 'title': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥',
650 'description': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥',
651 'uploader': 'barudakhb_',
f4f9f6d0 652 'channel': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6',
96f13f01 653 'uploader_id': '6974687867511718913',
347f13dd 654 'uploader_url': 'https://www.tiktok.com/@barudakhb_',
655 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d',
92593690 656 'channel_id': 'MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d',
96f13f01 657 'track': 'Boka Dance',
f4f9f6d0 658 'artists': ['md5:29f238c49bc0c176cb3cef1a9cea9fa6'],
96f13f01
M
659 'timestamp': 1626121503,
660 'duration': 18,
661 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
662 'upload_date': '20210712',
663 'view_count': int,
664 'like_count': int,
665 'repost_count': int,
666 'comment_count': int,
92593690 667 },
96f13f01
M
668 }, {
669 # Sponsored video, only available with feed workaround
670 'url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_/video/7042692929109986561',
671 'info_dict': {
672 'id': '7042692929109986561',
673 'ext': 'mp4',
674 'title': 'Slap and Run!',
675 'description': 'Slap and Run!',
676 'uploader': 'user440922249',
f4f9f6d0 677 'channel': 'Slap And Run',
96f13f01
M
678 'uploader_id': '7036055384943690754',
679 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_',
92593690 680 'channel_id': 'MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_',
96f13f01
M
681 'track': 'Promoted Music',
682 'timestamp': 1639754738,
683 'duration': 30,
684 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
685 'upload_date': '20211217',
686 'view_count': int,
687 'like_count': int,
688 'repost_count': int,
689 'comment_count': int,
690 },
347f13dd 691 'skip': 'This video is unavailable',
5fa3c9a8
HTL
692 }, {
693 # Video without title and description
694 'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694',
695 'info_dict': {
696 'id': '7059698374567611694',
697 'ext': 'mp4',
b801cd71 698 'title': 'TikTok video #7059698374567611694',
5fa3c9a8
HTL
699 'description': '',
700 'uploader': 'pokemonlife22',
f4f9f6d0 701 'channel': 'Pokemon',
5fa3c9a8 702 'uploader_id': '6820838815978423302',
347f13dd 703 'uploader_url': 'https://www.tiktok.com/@pokemonlife22',
704 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
92593690 705 'channel_id': 'MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
5fa3c9a8
HTL
706 'track': 'original sound',
707 'timestamp': 1643714123,
708 'duration': 6,
709 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
710 'upload_date': '20220201',
f4f9f6d0 711 'artists': ['Pokemon'],
5fa3c9a8
HTL
712 'view_count': int,
713 'like_count': int,
714 'repost_count': int,
715 'comment_count': int,
716 },
a39a7ba8 717 }, {
718 # hydration JSON is sent in a <script> element
719 'url': 'https://www.tiktok.com/@denidil6/video/7065799023130643713',
720 'info_dict': {
721 'id': '7065799023130643713',
722 'ext': 'mp4',
723 'title': '#denidil#денидил',
724 'description': '#denidil#денидил',
725 'uploader': 'denidil6',
726 'uploader_id': '7046664115636405250',
727 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAsvMSzFdQ4ikl3uR2TEJwMBbB2yZh2Zxwhx-WCo3rbDpAharE3GQCrFuJArI3C8QJ',
728 'artist': 'Holocron Music',
729 'album': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night',
730 'track': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night',
731 'timestamp': 1645134536,
732 'duration': 26,
733 'upload_date': '20220217',
734 'view_count': int,
735 'like_count': int,
736 'repost_count': int,
737 'comment_count': int,
738 },
f7c5a5e9 739 'skip': 'This video is unavailable',
8ceb07e8 740 }, {
741 # slideshow audio-only mp3 format
742 'url': 'https://www.tiktok.com/@_le_cannibale_/video/7139980461132074283',
743 'info_dict': {
744 'id': '7139980461132074283',
745 'ext': 'mp3',
746 'title': 'TikTok video #7139980461132074283',
747 'description': '',
f4f9f6d0 748 'channel': 'Antaura',
8ceb07e8 749 'uploader': '_le_cannibale_',
750 'uploader_id': '6604511138619654149',
347f13dd 751 'uploader_url': 'https://www.tiktok.com/@_le_cannibale_',
752 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP',
92593690 753 'channel_id': 'MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP',
f4f9f6d0 754 'artists': ['nathan !'],
8ceb07e8 755 'track': 'grahamscott canon',
347f13dd 756 'duration': 10,
8ceb07e8 757 'upload_date': '20220905',
758 'timestamp': 1662406249,
759 'view_count': int,
760 'like_count': int,
761 'repost_count': int,
762 'comment_count': int,
f4f9f6d0 763 'thumbnail': r're:^https://.+\.(?:webp|jpe?g)',
8ceb07e8 764 },
92593690 765 }, {
766 # only available via web
347f13dd 767 'url': 'https://www.tiktok.com/@moxypatch/video/7206382937372134662',
768 'md5': '4cdefa501ac8ac20bf04986e10916fea',
92593690 769 'info_dict': {
770 'id': '7206382937372134662',
771 'ext': 'mp4',
772 'title': 'md5:1d95c0b96560ca0e8a231af4172b2c0a',
773 'description': 'md5:1d95c0b96560ca0e8a231af4172b2c0a',
f4f9f6d0 774 'channel': 'MoxyPatch',
92593690 775 'uploader': 'moxypatch',
776 'uploader_id': '7039142049363379205',
347f13dd 777 'uploader_url': 'https://www.tiktok.com/@moxypatch',
778 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V',
92593690 779 'channel_id': 'MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V',
d9b4154c 780 'artists': ['your worst nightmare'],
92593690 781 'track': 'original sound',
782 'upload_date': '20230303',
783 'timestamp': 1677866781,
784 'duration': 10,
785 'view_count': int,
786 'like_count': int,
787 'repost_count': int,
788 'comment_count': int,
789 'thumbnail': r're:^https://.+',
790 'thumbnails': 'count:3',
791 },
792 'expected_warnings': ['Unable to find video in feed'],
c2a1bdb0 793 }, {
794 # 1080p format
add96eb9 795 'url': 'https://www.tiktok.com/@tatemcrae/video/7107337212743830830', # FIXME: Web can only get audio
c2a1bdb0 796 'md5': '982512017a8a917124d5a08c8ae79621',
797 'info_dict': {
798 'id': '7107337212743830830',
799 'ext': 'mp4',
800 'title': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! @musicontiktok',
801 'description': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! @musicontiktok',
802 'uploader': 'tatemcrae',
803 'uploader_id': '86328792343818240',
804 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd',
805 'channel_id': 'MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd',
f4f9f6d0 806 'channel': 'tate mcrae',
f4f9f6d0 807 'artists': ['tate mcrae'],
c2a1bdb0 808 'track': 'original sound',
809 'upload_date': '20220609',
810 'timestamp': 1654805899,
811 'duration': 150,
812 'view_count': int,
813 'like_count': int,
814 'repost_count': int,
815 'comment_count': int,
816 'thumbnail': r're:^https://.+\.webp',
817 },
347f13dd 818 'skip': 'Unavailable via feed API, only audio available via web',
b09bd0c1 819 }, {
820 # Slideshow, audio-only m4a format
821 'url': 'https://www.tiktok.com/@hara_yoimiya/video/7253412088251534594',
822 'md5': '2ff8fe0174db2dbf49c597a7bef4e47d',
823 'info_dict': {
824 'id': '7253412088251534594',
825 'ext': 'm4a',
826 'title': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ',
827 'description': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ',
828 'uploader': 'hara_yoimiya',
829 'uploader_id': '6582536342634676230',
347f13dd 830 'uploader_url': 'https://www.tiktok.com/@hara_yoimiya',
831 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB',
b09bd0c1 832 'channel_id': 'MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB',
347f13dd 833 'channel': 'лампочка(!)',
f4f9f6d0 834 'artists': ['Øneheart'],
b09bd0c1 835 'album': 'watching the stars',
836 'track': 'watching the stars',
347f13dd 837 'duration': 60,
b09bd0c1 838 'upload_date': '20230708',
839 'timestamp': 1688816612,
840 'view_count': int,
841 'like_count': int,
842 'comment_count': int,
843 'repost_count': int,
f4f9f6d0 844 'thumbnail': r're:^https://.+\.(?:webp|jpe?g)',
b09bd0c1 845 },
e0585e65
M
846 }, {
847 # Auto-captions available
848 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758',
add96eb9 849 'only_matching': True,
0fd6661e
M
850 }]
851
def _real_extract(self, url):
    """Extract a single video, trying the app API first and the webpage second."""
    mobj = self._match_valid_url(url)
    video_id, user_id = mobj.group('id', 'user_id')

    # The app API is only attempted when app info is configured/known
    if self._KNOWN_APP_INFO:
        try:
            return self._extract_aweme_app(video_id)
        except ExtractorError as err:
            # Downgrade the failure to a warning and fall through to the webpage
            err.expected = True
            self.report_warning(f'{err}; trying with webpage')

    url = self._create_url(user_id, video_id)
    video_data, status = self._extract_web_data_and_status(url, video_id)

    if video_data and status == 0:
        return self._parse_aweme_video_web(video_data, url, video_id)
    if status == 10216:
        raise ExtractorError('This video is private', expected=True)
    raise ExtractorError(f'Video not available, status code {status}', video_id=video_id)
f7f18f90
A
870
871
class TikTokUserIE(TikTokBaseIE):
    # Playlist extractor for a user's uploads; accepts profile URLs and "tiktokuser:<secUid>"
    IE_NAME = 'tiktok:user'
    _VALID_URL = r'(?:tiktokuser:|https?://(?:www\.)?tiktok\.com/@)(?P<id>[\w.-]+)/?(?:$|[#?])'
    _TESTS = [{
        'url': 'https://tiktok.com/@corgibobaa?lang=en',
        'playlist_mincount': 45,
        'info_dict': {
            'id': 'MS4wLjABAAAAepiJKgwWhulvCpSuUVsp7sgVVsFJbbNaLeQ6OQ0oAJERGDUIXhb2yxxHZedsItgT',
            'title': 'corgibobaa',
        },
    }, {
        'url': 'https://www.tiktok.com/@6820838815978423302',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
            'title': '6820838815978423302',
        },
    }, {
        'url': 'https://www.tiktok.com/@meme',
        'playlist_mincount': 593,
        'info_dict': {
            'id': 'MS4wLjABAAAAiKfaDWeCsT3IHwY77zqWGtVRIy9v4ws1HbVi7auP1Vx7dJysU_hc5yRiGywojRD6',
            'title': 'meme',
        },
    }, {
        'url': 'tiktokuser:MS4wLjABAAAAM3R2BtjzVT-uAtstkl2iugMzC6AtnpkojJbjiOdDDrdsTiTR75-8lyWJCY5VvDrZ',
        'playlist_mincount': 31,
        'info_dict': {
            'id': 'MS4wLjABAAAAM3R2BtjzVT-uAtstkl2iugMzC6AtnpkojJbjiOdDDrdsTiTR75-8lyWJCY5VvDrZ',
        },
    }]
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0'
    _API_BASE_URL = 'https://www.tiktok.com/api/creator/item_list/'

    def _build_web_query(self, sec_uid, cursor):
        """Build the query parameters for one item_list API request."""
        # Random 7-character token drawn from the hex digit alphabet
        verify_token = ''.join(random.choices(string.hexdigits, k=7))
        return {
            'aid': '1988',
            'app_language': 'en',
            'app_name': 'tiktok_web',
            'browser_language': 'en-US',
            'browser_name': 'Mozilla',
            'browser_online': 'true',
            'browser_platform': 'Win32',
            'browser_version': '5.0 (Windows)',
            'channel': 'tiktok_web',
            'cookie_enabled': 'true',
            'count': '15',
            'cursor': cursor,
            'device_id': self._DEVICE_ID,
            'device_platform': 'web_pc',
            'focus_state': 'true',
            'from_page': 'user',
            'history_len': '2',
            'is_fullscreen': 'false',
            'is_page_visible': 'true',
            'language': 'en',
            'os': 'windows',
            'priority_region': '',
            'referer': '',
            'region': 'US',
            'screen_height': '1080',
            'screen_width': '1920',
            'secUid': sec_uid,
            'type': '1',  # pagination type: 0 == oldest-to-newest, 1 == newest-to-oldest
            'tz_name': 'UTC',
            'verifyFp': f'verify_{verify_token}',
            'webcast_language': 'en',
        }

    def _entries(self, sec_uid, user_name):
        """Yield one flat url_result per video, paging backwards in time from now."""
        def to_millis(seconds):
            return int(seconds * 1E3)

        display_id = user_name or sec_uid
        seen_ids = set()
        request_headers = {'User-Agent': self._USER_AGENT}

        # The cursor is a millisecond timestamp; start from the current time
        cursor = int(time.time() * 1E3)
        for page_num in itertools.count(1):
            page_data = self._download_json(
                self._API_BASE_URL, display_id, f'Downloading page {page_num}',
                query=self._build_web_query(sec_uid, cursor), headers=request_headers)

            for item in traverse_obj(page_data, ('itemList', lambda _, v: v['id'])):
                video_id = item['id']
                # Skip videos that were already yielded from an earlier page
                if video_id in seen_ids:
                    continue
                seen_ids.add(video_id)
                webpage_url = self._create_url(display_id, video_id)
                flat_info = self._parse_aweme_video_web(
                    item, webpage_url, video_id, extract_flat=True)
                yield self.url_result(webpage_url, TikTokIE, **flat_info)

            previous_cursor = cursor
            # Next cursor is the creation time of the last item on this page
            cursor = traverse_obj(
                page_data, ('itemList', -1, 'createTime', {to_millis}))
            if not cursor or cursor == previous_cursor:
                # User may not have posted within this ~1 week lookback, so manually adjust cursor
                cursor = previous_cursor - 7 * 86_400_000
            # In case 'hasMorePrevious' is wrong, break if we have gone back before TikTok existed
            has_more = traverse_obj(page_data, 'hasMorePrevious')
            if not has_more or cursor < 1472706000000:
                break

    def _get_sec_uid(self, user_url, user_name, msg):
        """Download user_url (non-fatally) and look for the user's secUid in its embedded data."""
        webpage = self._download_webpage(
            user_url, user_name, fatal=False, headers={'User-Agent': 'Mozilla/5.0'},
            note=f'Downloading {msg} webpage', errnote=f'Unable to download {msg} webpage') or ''

        sec_uid = traverse_obj(
            self._get_universal_data(webpage, user_name),
            ('webapp.user-detail', 'userInfo', 'user', 'secUid', {str}))
        if sec_uid:
            return sec_uid

        # Only consult the sigi state when the universal data had no secUid
        return traverse_obj(
            self._get_sigi_state(webpage, user_name),
            ('LiveRoom', 'liveRoomUserInfo', 'user', 'secUid', {str}),
            ('UserModule', 'users', ..., 'secUid', {str}, any))

    def _real_extract(self, url):
        """Resolve the secUid for the requested user and return their videos as a playlist."""
        matched_id = self._match_id(url)
        if re.fullmatch(r'MS4wLjABAAAA[\w-]{64}', matched_id):
            # The input already is a secUid, so there is no user name to work with
            user_name, sec_uid = None, matched_id
        else:
            user_name = matched_id
            sec_uid = (
                self._get_sec_uid(self._UPLOADER_URL_FORMAT % user_name, user_name, 'user')
                or self._get_sec_uid(self._UPLOADER_URL_FORMAT % f'{user_name}/live', user_name, 'live'))

        if not sec_uid:
            # Fall back to the embed page: take its video IDs and read channel_id from a video
            embed_page = self._download_webpage(
                f'https://www.tiktok.com/embed/@{user_name}', user_name,
                note='Downloading user embed page', fatal=False) or ''
            embed_state = self._search_json(
                r'<script[^>]+\bid=[\'"]__FRONTITY_CONNECT_STATE__[\'"][^>]*>',
                embed_page, 'data', user_name, default={})
            data = traverse_obj(
                embed_state, ('source', 'data', f'/embed/@{user_name}', {dict}))

            for aweme_id in traverse_obj(data, ('videoList', ..., 'id', {str})):
                webpage_url = self._create_url(user_name, aweme_id)
                video_data, _ = self._extract_web_data_and_status(webpage_url, aweme_id, fatal=False)
                sec_uid = self._parse_aweme_video_web(
                    video_data, webpage_url, aweme_id, extract_flat=True).get('channel_id')
                if sec_uid:
                    break

        if not sec_uid:
            raise ExtractorError(
                'Unable to extract secondary user ID. If you are able to get the channel_id '
                'from a video posted by this user, try using "tiktokuser:channel_id" as the '
                'input URL (replacing `channel_id` with its actual value)', expected=True)

        return self.playlist_result(self._entries(sec_uid, user_name), sec_uid, user_name)
943d5ab1
M
1013
1014
class TikTokBaseListIE(TikTokBaseIE):  # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor
    # Shared paging logic for list extractors.
    # Subclasses are expected to define _QUERY_NAME (name of the query parameter
    # that carries the list ID) and _API_ENDPOINT (endpoint passed to _call_api).

    def _entries(self, list_id, display_id):
        """Yield info dicts for every video of the list, following the API cursor.

        @param list_id      value sent under the subclass' _QUERY_NAME parameter
        @param display_id   ID used for the download notes
        """
        query = {
            self._QUERY_NAME: list_id,
            'cursor': 0,
            'count': 20,
            'type': 5,
            'device_id': self._DEVICE_ID,
        }

        for page in itertools.count(1):
            for retry in self.RetryManager():
                try:
                    post_list = self._call_api(
                        self._API_ENDPOINT, query, display_id, note=f'Downloading video list page {page}',
                        errnote='Unable to download video list')
                except ExtractorError as e:
                    # Only retry when JSON decoding failed at the very first character;
                    # any other error is propagated unchanged
                    if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0:
                        retry.error = e
                        continue
                    raise
            # dict.get's default only applies to a missing key; `or []` also covers
            # an explicit null "aweme_list", which would otherwise raise TypeError
            for video in post_list.get('aweme_list') or []:
                yield {
                    **self._parse_aweme_video_app(video),
                    'extractor_key': TikTokIE.ie_key(),
                    'extractor': 'TikTok',
                    'webpage_url': f'https://tiktok.com/@_/video/{video["aweme_id"]}',
                }
            if not post_list.get('has_more'):
                break
            query['cursor'] = post_list['cursor']

    def _real_extract(self, url):
        """Return the list as a playlist, using the matched ID as both list ID and display ID."""
        list_id = self._match_id(url)
        return self.playlist_result(self._entries(list_id, list_id), list_id)
1050
1051
class TikTokSoundIE(TikTokBaseListIE):
    # List extractor for /music/ URLs; the trailing digits of the URL are the list ID
    IE_NAME = 'tiktok:sound'
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/music/[\w\.-]+-(?P<id>[\d]+)[/?#&]?'
    _WORKING = False

    # Consumed by TikTokBaseListIE._entries
    _QUERY_NAME = 'music_id'
    _API_ENDPOINT = 'music/aweme'

    _TESTS = [{
        'url': 'https://www.tiktok.com/music/Build-a-Btch-6956990112127585029?lang=en',
        'playlist_mincount': 100,
        'info_dict': {
            'id': '6956990112127585029',
        },
        'expected_warnings': ['Retrying'],
    }, {
        # Actual entries are less than listed video count
        'url': 'https://www.tiktok.com/music/jiefei-soap-remix-7036843036118469381',
        'playlist_mincount': 2182,
        'info_dict': {
            'id': '7036843036118469381',
        },
        'expected_warnings': ['Retrying'],
    }]
1074
1075
class TikTokEffectIE(TikTokBaseListIE):
    # List extractor for /sticker/ URLs; the trailing digits of the URL are the list ID
    IE_NAME = 'tiktok:effect'
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/sticker/[\w\.-]+-(?P<id>[\d]+)[/?#&]?'
    _WORKING = False

    # Consumed by TikTokBaseListIE._entries
    _QUERY_NAME = 'sticker_id'
    _API_ENDPOINT = 'sticker/aweme'

    _TESTS = [{
        'url': 'https://www.tiktok.com/sticker/MATERIAL-GWOOORL-1258156',
        'playlist_mincount': 100,
        'info_dict': {
            'id': '1258156',
        },
        'expected_warnings': ['Retrying'],
    }, {
        # Different entries between mobile and web, depending on region
        'url': 'https://www.tiktok.com/sticker/Elf-Friend-479565',
        'only_matching': True,
    }]
1094
1095
class TikTokTagIE(TikTokBaseListIE):
    # List extractor for /tag/ URLs; the numeric tag ID is scraped from the tag webpage
    IE_NAME = 'tiktok:tag'
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/tag/(?P<id>[^/?#&]+)'
    _WORKING = False

    # Consumed by TikTokBaseListIE._entries
    _QUERY_NAME = 'ch_id'
    _API_ENDPOINT = 'challenge/aweme'

    _TESTS = [{
        'url': 'https://tiktok.com/tag/hello2018',
        'playlist_mincount': 39,
        'info_dict': {
            'id': '46294678',
            'title': 'hello2018',
        },
        'expected_warnings': ['Retrying'],
    }, {
        'url': 'https://tiktok.com/tag/fypシ?is_copy_url=0&is_from_webapp=v1',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Look up the numeric tag ID on the tag page and return the tag's videos as a playlist."""
        display_id = self._match_id(url)
        request_headers = {
            'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)',
        }
        webpage = self._download_webpage(url, display_id, headers=request_headers)
        # The ID is taken from a "snssdk…://challenge/detail/<id>" link in the page
        tag_id = self._html_search_regex(r'snssdk\d*://challenge/detail/(\d+)', webpage, 'tag ID')
        entries = self._entries(tag_id, display_id)
        return self.playlist_result(entries, tag_id, display_id)
1122
1123
class TikTokCollectionIE(TikTokBaseIE):
    # Playlist extractor for a user's video collection
    IE_NAME = 'tiktok:collection'
    _VALID_URL = r'https?://www\.tiktok\.com/@(?P<user_id>[\w.-]+)/collection/(?P<title>[^/?#]+)-(?P<id>\d+)/?(?:[?#]|$)'
    _TESTS = [{
        # playlist should have exactly 9 videos
        'url': 'https://www.tiktok.com/@imanoreotwe/collection/count-test-7371330159376370462',
        'info_dict': {
            'id': '7371330159376370462',
            'title': 'imanoreotwe-count-test',
        },
        'playlist_count': 9,
    }, {
        # tests returning multiple pages of a large collection
        'url': 'https://www.tiktok.com/@imanoreotwe/collection/%F0%9F%98%82-7111887189571160875',
        'info_dict': {
            'id': '7111887189571160875',
            'title': 'imanoreotwe-%F0%9F%98%82',
        },
        'playlist_mincount': 100,
    }]
    _API_BASE_URL = 'https://www.tiktok.com/api/collection/item_list/'
    # Number of items requested per page; also the cursor step between pages
    _PAGE_COUNT = 30

    def _build_web_query(self, collection_id, cursor):
        """Build the query parameters for one collection item_list request."""
        return {
            'aid': '1988',
            'collectionId': collection_id,
            'count': self._PAGE_COUNT,
            'cursor': cursor,
            'sourceType': '113',
        }

    def _entries(self, collection_id):
        """Yield one flat url_result per video until the API stops reporting 'hasMore'."""
        for page_num in itertools.count(1):
            # The cursor is an item offset: 0 for page 1, then one page size per page
            cursor = (page_num - 1) * self._PAGE_COUNT
            page_data = self._download_json(
                self._API_BASE_URL, collection_id, f'Downloading page {page_num}',
                query=self._build_web_query(collection_id, cursor))

            for item in traverse_obj(page_data, ('itemList', lambda _, v: v['id'])):
                video_id = item['id']
                # First available author identifier, or '_' when none is present
                author = traverse_obj(item, ('author', ('uniqueId', 'secUid', 'id'), {str}, any)) or '_'
                webpage_url = self._create_url(author, video_id)
                flat_info = self._parse_aweme_video_web(
                    item, webpage_url, video_id, extract_flat=True)
                yield self.url_result(webpage_url, TikTokIE, **flat_info)

            if not traverse_obj(page_data, 'hasMore'):
                break

    def _real_extract(self, url):
        """Return the collection as a playlist titled '<user>-<collection title>'."""
        mobj = self._match_valid_url(url)
        collection_id, title, user_name = mobj.group('id', 'title', 'user_id')

        return self.playlist_result(
            self._entries(collection_id), collection_id, f'{user_name}-{title}')
1180
1181
class DouyinIE(TikTokBaseIE):
    # Single-video extractor for douyin.com, backed by the web detail JSON endpoint
    _VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'https://www.douyin.com/video/6961737553342991651',
        'md5': '9ecce7bc5b302601018ecb2871c63a75',
        'info_dict': {
            'id': '6961737553342991651',
            'ext': 'mp4',
            'title': '#杨超越 小小水手带你去远航❤️',
            'description': '#杨超越 小小水手带你去远航❤️',
            'uploader': '6897520xka',
            'uploader_id': '110403406559',
            'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
            'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
            'channel': '杨超越',
            'duration': 19,
            'timestamp': 1620905839,
            'upload_date': '20210513',
            'track': '@杨超越创作的原声',
            'artists': ['杨超越'],
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
            'thumbnail': r're:https?://.+\.jpe?g',
        },
    }, {
        'url': 'https://www.douyin.com/video/6982497745948921092',
        'md5': '15c5e660b7048af3707304e3cc02bbb5',
        'info_dict': {
            'id': '6982497745948921092',
            'ext': 'mp4',
            'title': '这个夏日和小羊@杨超越 一起遇见白色幻想',
            'description': '这个夏日和小羊@杨超越 一起遇见白色幻想',
            'uploader': '0731chaoyue',
            'uploader_id': '408654318141572',
            'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
            'channel_id': 'MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
            'channel': '杨超越工作室',
            'duration': 42,
            'timestamp': 1625739481,
            'upload_date': '20210708',
            'track': '@杨超越工作室创作的原声',
            'artists': ['杨超越工作室'],
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
            'thumbnail': r're:https?://.+\.jpe?g',
        },
    }, {
        'url': 'https://www.douyin.com/video/6953975910773099811',
        'md5': '0e6443758b8355db9a3c34864a4276be',
        'info_dict': {
            'id': '6953975910773099811',
            'ext': 'mp4',
            'title': '#一起看海 出现在你的夏日里',
            'description': '#一起看海 出现在你的夏日里',
            'uploader': '6897520xka',
            'uploader_id': '110403406559',
            'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
            'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
            'channel': '杨超越',
            'duration': 17,
            'timestamp': 1619098692,
            'upload_date': '20210422',
            'track': '@杨超越创作的原声',
            'artists': ['杨超越'],
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
            'thumbnail': r're:https?://.+\.jpe?g',
        },
    }, {
        'url': 'https://www.douyin.com/video/6950251282489675042',
        'md5': 'b4db86aec367ef810ddd38b1737d2fed',
        'info_dict': {
            'id': '6950251282489675042',
            'ext': 'mp4',
            'title': '哈哈哈,成功了哈哈哈哈哈哈',
            'uploader': '杨超越',
            'upload_date': '20210412',
            'timestamp': 1618231483,
            'uploader_id': '110403406559',
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
        },
        'skip': 'No longer available',
    }, {
        'url': 'https://www.douyin.com/video/6963263655114722595',
        'md5': '1440bcf59d8700f8e014da073a4dfea8',
        'info_dict': {
            'id': '6963263655114722595',
            'ext': 'mp4',
            'title': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
            'description': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
            'uploader': '6897520xka',
            'uploader_id': '110403406559',
            'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
            'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
            'channel': '杨超越',
            'duration': 15,
            'timestamp': 1621261163,
            'upload_date': '20210517',
            'track': '@杨超越创作的原声',
            'artists': ['杨超越'],
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
            'thumbnail': r're:https?://.+\.jpe?g',
        },
    }]
    # Douyin-specific overrides of the TikTokBaseIE URL settings
    _UPLOADER_URL_FORMAT = 'https://www.douyin.com/user/%s'
    _WEBPAGE_HOST = 'https://www.douyin.com/'

    def _real_extract(self, url):
        """Fetch the web detail JSON for the video and parse its 'aweme_detail' object."""
        video_id = self._match_id(url)

        response = self._download_json(
            'https://www.douyin.com/aweme/v1/web/aweme/detail/', video_id,
            'Downloading web detail JSON', 'Failed to download web detail JSON',
            query={'aweme_id': video_id}, fatal=False)
        detail = traverse_obj(response, ('aweme_detail', {dict}))
        if detail:
            return self._parse_aweme_video_app(detail)

        # TODO: Run verification challenge code to generate signature cookies
        # The error counts as "expected" only when no s_v_web_id cookie is set
        has_web_id_cookie = self._get_cookies(self._WEBPAGE_HOST).get('s_v_web_id')
        raise ExtractorError(
            'Fresh cookies (not necessarily logged in) are needed',
            expected=not has_web_id_cookie)
88afe056 1315
1316
class TikTokVMIE(InfoExtractor):
    # Resolves short links (vm./vt. subdomains and tiktok.com/t/<id>) to their target URL.
    # "www." is optional for the /t/ form, matching the other TikTok extractors.
    _VALID_URL = r'https?://(?:(?:vm|vt)\.tiktok\.com|(?:www\.)?tiktok\.com/t)/(?P<id>\w+)'
    IE_NAME = 'vm.tiktok'

    _TESTS = [{
        'url': 'https://www.tiktok.com/t/ZTRC5xgJp',
        'info_dict': {
            'id': '7170520270497680683',
            'ext': 'mp4',
            'title': 'md5:c64f6152330c2efe98093ccc8597871c',
            'uploader_id': '6687535061741700102',
            'upload_date': '20221127',
            'view_count': int,
            'like_count': int,
            'comment_count': int,
            'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAObqu3WCTXxmw2xwZ3iLEHnEecEIw7ks6rxWqOqOhaPja9BI7gqUQnjw8_5FSoDXX',
            'album': 'Wave of Mutilation: Best of Pixies',
            'thumbnail': r're:https://.+\.webp.*',
            'duration': 5,
            'timestamp': 1669516858,
            'repost_count': int,
            'artist': 'Pixies',
            'track': 'Where Is My Mind?',
            'description': 'md5:c64f6152330c2efe98093ccc8597871c',
            'uploader': 'sigmachaddeus',
            'creator': 'SigmaChad',
        },
    }, {
        'url': 'https://vm.tiktok.com/ZTR45GpSF/',
        'info_dict': {
            'id': '7106798200794926362',
            'ext': 'mp4',
            'title': 'md5:edc3e7ea587847f8537468f2fe51d074',
            'uploader_id': '6997695878846268418',
            'upload_date': '20220608',
            'view_count': int,
            'like_count': int,
            'comment_count': int,
            'thumbnail': r're:https://.+\.webp.*',
            'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAdZ_NcPPgMneaGrW0hN8O_J_bwLshwNNERRF5DxOw2HKIzk0kdlLrR8RkVl1ksrMO',
            'duration': 29,
            'timestamp': 1654680400,
            'repost_count': int,
            'artist': 'Akihitoko',
            'track': 'original sound',
            'description': 'md5:edc3e7ea587847f8537468f2fe51d074',
            'uploader': 'akihitoko1',
            'creator': 'Akihitoko',
        },
    }, {
        'url': 'https://vt.tiktok.com/ZSe4FqkKd',
        'only_matching': True,
    }, {
        # /t/ short link without the "www." prefix
        'url': 'https://tiktok.com/t/ZTRC5xgJp',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Issue a HEAD request for the short link and delegate to the URL it resolves to.

        Raises UnsupportedError when the resolved URL is still a short link.
        """
        new_url = self._request_webpage(
            HEADRequest(url), self._match_id(url), headers={'User-Agent': 'facebookexternalhit/1.1'}).url
        if self.suitable(new_url):  # Prevent infinite loop in case redirect fails
            raise UnsupportedError(new_url)
        return self.url_result(new_url)
933ed882
JC
1377
1378
class TikTokLiveIE(TikTokBaseIE):
    # Livestream extractor; accepts "@user/live" pages and mobile share URLs with a room ID
    _VALID_URL = r'''(?x)https?://(?:
        (?:www\.)?tiktok\.com/@(?P<uploader>[\w.-]+)/live|
        m\.tiktok\.com/share/live/(?P<id>\d+)
    )'''
    IE_NAME = 'tiktok:live'

    _TESTS = [{
        'url': 'https://www.tiktok.com/@weathernewslive/live',
        'info_dict': {
            'id': '7210809319192726273',
            'ext': 'mp4',
            'title': r're:ウェザーニュースLiVE[\d\s:-]*',
            'creator': 'ウェザーニュースLiVE',
            'uploader': 'weathernewslive',
            'uploader_id': '6621496731283095554',
            'uploader_url': 'https://www.tiktok.com/@weathernewslive',
            'live_status': 'is_live',
            'concurrent_view_count': int,
        },
        'params': {'skip_download': 'm3u8'},
    }, {
        'url': 'https://www.tiktok.com/@pilarmagenta/live',
        'info_dict': {
            'id': '7209423610325322522',
            'ext': 'mp4',
            'title': str,
            'creator': 'Pilarmagenta',
            'uploader': 'pilarmagenta',
            'uploader_id': '6624846890674683909',
            'uploader_url': 'https://www.tiktok.com/@pilarmagenta',
            'live_status': 'is_live',
            'concurrent_view_count': int,
        },
        'skip': 'Livestream',
    }, {
        'url': 'https://m.tiktok.com/share/live/7209423610325322522/?language=en',
        'only_matching': True,
    }, {
        'url': 'https://www.tiktok.com/@iris04201/live',
        'only_matching': True,
    }]

    def _call_api(self, url, param, room_id, uploader, key=None):
        """Query a room-info endpoint and return its payload only if the room is live.

        Raises UserNotLive when an uploader name is known, otherwise an expected
        ExtractorError saying the livestream has ended.
        """
        payload = self._download_json(
            url, room_id, fatal=False, query={
                'aid': '1988',
                param: room_id,
            })
        room_info = traverse_obj(payload, (key, {dict}), default={})

        # status == 2 if live else 4
        if int_or_none(room_info.get('status')) == 2:
            return room_info
        if uploader:
            raise UserNotLive(video_id=uploader)
        # If room_id is obtained via mobile share URL and cannot be refreshed, do not wait for live
        raise ExtractorError('This livestream has ended', expected=True)

    def _real_extract(self, url):
        """Resolve the room ID, collect every advertised stream URL and return the live info dict."""
        uploader, room_id = self._match_valid_url(url).group('uploader', 'id')
        display_id = uploader or room_id
        # The webpage is only mandatory when the URL did not carry a room ID
        webpage = self._download_webpage(
            url, display_id, headers={'User-Agent': 'Mozilla/5.0'}, fatal=not room_id)

        if webpage:
            data = self._get_sigi_state(webpage, display_id)
            room_id = (
                traverse_obj(data, ('UserModule', 'users', ..., 'roomId', {str_or_none}), get_all=False)
                or self._search_regex(r'snssdk\d*://live\?room_id=(\d+)', webpage, 'room ID', default=None)
                or room_id)
            uploader = uploader or traverse_obj(
                data, ('LiveRoom', 'liveRoomUserInfo', 'user', 'uniqueId'),
                ('UserModule', 'users', ..., 'uniqueId'), get_all=False, expected_type=str)

        if not room_id:
            raise UserNotLive(video_id=uploader)

        formats = []
        live_info = self._call_api(
            'https://webcast.tiktok.com/webcast/room/info', 'room_id', room_id, uploader, key='data')

        get_quality = qualities(('SD1', 'ld', 'SD2', 'sd', 'HD1', 'hd', 'FULL_HD1', 'uhd', 'ORIGION', 'origin'))

        def parse_inner(json_string):
            # Some fields hold JSON serialized as a string
            return self._parse_json(json_string, None)

        def get_vcodec(*keys):
            return traverse_obj(live_info, (
                'stream_url', *keys, {parse_inner}, 'VCodec', {str}))

        # Format fields taken from each stream's serialized "sdk_params"
        sdk_fields = {
            'vcodec': ('VCodec', {str}),
            'tbr': ('vbitrate', {lambda x: int_or_none(x, 1000)}),
            'resolution': ('resolution', {lambda x: re.match(r'(?i)\d+x\d+|\d+p', x).group().lower()}),
        }

        stream_data = traverse_obj(live_info, (
            'stream_url', 'live_core_sdk_data', 'pull_data', 'stream_data',
            {parse_inner}, 'data', {dict}), default={})
        for quality, stream in stream_data.items():
            sdk_params = traverse_obj(stream, ('main', 'sdk_params', {parse_inner}, sdk_fields))

            # (key under 'main', ext, extra format fields); FLV is added before HLS
            for kind, ext, extra in (
                ('flv', 'flv', {}),
                ('hls', 'mp4', {'protocol': 'm3u8_native'}),
            ):
                stream_url = traverse_obj(stream, ('main', kind, {url_or_none}))
                if not stream_url:
                    continue
                formats.append({
                    'url': stream_url,
                    'ext': ext,
                    **extra,
                    'format_id': f'{kind}-{quality}',
                    'quality': get_quality(quality),
                    **sdk_params,
                })

        # (stream name, ext, protocol) for the top-level "<name>_pull_url" entries
        for name, ext, protocol in (
            ('hls', 'mp4', 'm3u8_native'),
            ('rtmp', 'flv', 'https'),
        ):
            pull_url = traverse_obj(live_info, ('stream_url', f'{name}_pull_url', {url_or_none}))
            if not pull_url:
                continue
            formats.append({
                'url': pull_url,
                'ext': ext,
                'protocol': protocol,
                'format_id': f'{name}-pull',
                'vcodec': get_vcodec(f'{name}_pull_url_params'),
                'quality': get_quality('ORIGION'),
            })

        flv_pull_urls = traverse_obj(live_info, ('stream_url', 'flv_pull_url', {dict}), default={})
        for f_id, f_url in flv_pull_urls.items():
            if url_or_none(f_url):
                formats.append({
                    'url': f_url,
                    'ext': 'flv',
                    'format_id': f'flv-{f_id}'.lower(),
                    'vcodec': get_vcodec('flv_pull_url_params', f_id),
                    'quality': get_quality(f_id),
                })

        # If uploader is a guest on another's livestream, primary endpoint will not have m3u8 URLs
        has_mp4 = traverse_obj(formats, lambda _, v: v['ext'] == 'mp4')
        if not has_mp4:
            fallback_info = self._call_api(
                'https://www.tiktok.com/api/live/detail/', 'roomID', room_id, uploader, key='LiveRoomInfo')
            live_info = merge_dicts(live_info, fallback_info)
            if url_or_none(live_info.get('liveUrl')):
                formats.append({
                    'url': live_info['liveUrl'],
                    'ext': 'mp4',
                    'protocol': 'm3u8_native',
                    'format_id': 'hls-fallback',
                    'vcodec': 'h264',
                    'quality': get_quality('origin'),
                })

        uploader = uploader or traverse_obj(live_info, ('ownerInfo', 'uniqueId'), ('owner', 'display_id'))

        metadata = traverse_obj(live_info, {
            'title': 'title',
            'uploader_id': (('ownerInfo', 'owner'), 'id', {str_or_none}),
            'creator': (('ownerInfo', 'owner'), 'nickname'),
            'concurrent_view_count': (('user_count', ('liveRoomStats', 'userCount')), {int_or_none}),
        }, get_all=False)

        return {
            'id': room_id,
            'uploader': uploader,
            'uploader_url': format_field(uploader, None, self._UPLOADER_URL_FORMAT) or None,
            'is_live': True,
            'formats': formats,
            '_format_sort_fields': ('quality', 'ext'),
            **metadata,
        }