-# coding: utf-8
-from __future__ import unicode_literals
-
import itertools
+import json
import random
import string
import time
-import json
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse_unquote,
- compat_urllib_parse_urlparse
-)
+from ..compat import compat_urllib_parse_unquote, compat_urllib_parse_urlparse
from ..utils import (
ExtractorError,
HEADRequest,
+ LazyList,
+ UnsupportedError,
+ get_element_by_id,
get_first,
int_or_none,
join_nonempty,
- LazyList,
+ qualities,
srt_subtitles_timecode,
str_or_none,
traverse_obj,
try_get,
url_or_none,
- qualities,
)
_UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s'
_WEBPAGE_HOST = 'https://www.tiktok.com/'
QUALITIES = ('360p', '540p', '720p', '1080p')
+ _session_initialized = False
+
+ @staticmethod
+ def _create_url(user_id, video_id):
+ # Build the web URL of a video page from its uploader id and video id.
+ # A falsy user_id (None, '') is replaced by the placeholder '_' so that the
+ # path still has the '@<user>/video/<id>' shape.
+ return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}'
+
+ def _get_sigi_state(self, webpage, display_id):
+ # Extract the content of the page element whose id is 'SIGI_STATE' or
+ # 'sigi-persisted-data' and parse it as JSON; display_id is handed to
+ # _parse_json alongside the JSON text.
+ # presumably escape_value=False is what lets the '|' in the id act as a
+ # regex alternation - TODO confirm against get_element_by_id.
+ # NOTE(review): if neither element exists, get_element_by_id may return None,
+ # which is then passed straight to _parse_json - confirm how that is handled.
+ return self._parse_json(get_element_by_id(
+ 'SIGI_STATE|sigi-persisted-data', webpage, escape_value=False), display_id)
+
+ def _real_initialize(self):
+ # One-time session warm-up. `_session_initialized` is read through the instance
+ # but written on the TikTokBaseIE class below, so once this has run, later
+ # calls return immediately.
+ if self._session_initialized:
+ return
+ # HEAD request to the site root, issued with fatal=False (presumably a failed
+ # request is then only reported, not raised - TODO confirm); presumably this is
+ # what obtains the session cookies - TODO confirm.
+ self._request_webpage(HEADRequest('https://www.tiktok.com'), None, note='Setting up session', fatal=False)
+ # The flag is set unconditionally, i.e. without checking the outcome of the
+ # request above, so the warm-up is not attempted again.
+ TikTokBaseIE._session_initialized = True
def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True,
note='Downloading API JSON', errnote='Unable to download API page'):
return {
'id': aweme_id,
- 'title': aweme_detail['desc'],
- 'description': aweme_detail['desc'],
+ 'extractor_key': TikTokIE.ie_key(),
+ 'extractor': TikTokIE.IE_NAME,
+ 'webpage_url': self._create_url(author_info.get('uid'), aweme_id),
+ 'title': aweme_detail.get('desc'),
+ 'description': aweme_detail.get('desc'),
'view_count': int_or_none(stats_info.get('play_count')),
'like_count': int_or_none(stats_info.get('digg_count')),
'repost_count': int_or_none(stats_info.get('share_count')),
class TikTokIE(TikTokBaseIE):
- _VALID_URL = r'https?://www\.tiktok\.com/@[\w\.-]+/video/(?P<id>\d+)'
+ _VALID_URL = r'https?://www\.tiktok\.com/(?:embed|@(?P<user_id>[\w\.-]+)/video)/(?P<id>\d+)'
+ _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
_TESTS = [{
'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610',
'like_count': int,
'repost_count': int,
'comment_count': int,
+ 'artist': 'Ysrbeats',
+ 'album': 'Lehanga',
+ 'track': 'Lehanga',
}
}, {
'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en',
'like_count': int,
'repost_count': int,
'comment_count': int,
+ 'artist': 'Evan Todd, Jessica Keenan Wynn, Alice Lee, Barrett Wilbert Weed & Jon Eidson',
+ 'track': 'Big Fun',
}
}, {
# Banned audio, only available on the app
'repost_count': int,
'comment_count': int,
},
- 'expected_warnings': ['Video not available']
+ 'expected_warnings': ['trying with webpage', 'Unable to find video in feed']
+ }, {
+ # Video without title and description
+ 'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694',
+ 'info_dict': {
+ 'id': '7059698374567611694',
+ 'ext': 'mp4',
+ 'title': 'TikTok video #7059698374567611694',
+ 'description': '',
+ 'uploader': 'pokemonlife22',
+ 'creator': 'Pokemon',
+ 'uploader_id': '6820838815978423302',
+ 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
+ 'track': 'original sound',
+ 'timestamp': 1643714123,
+ 'duration': 6,
+ 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
+ 'upload_date': '20220201',
+ 'artist': 'Pokemon',
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ },
+ }, {
+ # hydration JSON is sent in a <script> element
+ 'url': 'https://www.tiktok.com/@denidil6/video/7065799023130643713',
+ 'info_dict': {
+ 'id': '7065799023130643713',
+ 'ext': 'mp4',
+ 'title': '#denidil#денидил',
+ 'description': '#denidil#денидил',
+ 'uploader': 'denidil6',
+ 'uploader_id': '7046664115636405250',
+ 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAsvMSzFdQ4ikl3uR2TEJwMBbB2yZh2Zxwhx-WCo3rbDpAharE3GQCrFuJArI3C8QJ',
+ 'artist': 'Holocron Music',
+ 'album': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night',
+ 'track': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night',
+ 'timestamp': 1645134536,
+ 'duration': 26,
+ 'upload_date': '20220217',
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ },
+ 'expected_warnings': ['trying feed workaround', 'Unable to find video in feed']
}, {
# Auto-captions available
'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758',
if not aweme_detail:
raise ExtractorError('Video not available', video_id=aweme_id)
except ExtractorError as e:
- self.report_warning(f'{e}; Retrying with feed workaround')
+ self.report_warning(f'{e.orig_msg}; trying feed workaround')
feed_list = self._call_api('feed', {'aweme_id': aweme_id}, aweme_id,
note='Downloading video feed', errnote='Unable to download video feed').get('aweme_list') or []
aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None)
return self._parse_aweme_video_app(aweme_detail)
def _real_extract(self, url):
- video_id = self._match_id(url)
-
+ video_id, user_id = self._match_valid_url(url).group('id', 'user_id')
try:
return self._extract_aweme_app(video_id)
except ExtractorError as e:
- self.report_warning(f'{e}; Retrying with webpage')
+ self.report_warning(f'{e}; trying with webpage')
- # If we only call once, we get a 403 when downlaoding the video.
- self._download_webpage(url, video_id)
- webpage = self._download_webpage(url, video_id, note='Downloading video webpage')
+ url = self._create_url(user_id, video_id)
+ webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'User-Agent:Mozilla/5.0'})
next_data = self._search_nextjs_data(webpage, video_id, default='{}')
-
if next_data:
status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode'), expected_type=int) or 0
video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct'), expected_type=dict)
else:
- sigi_json = self._search_regex(
- r'>\s*window\[[\'"]SIGI_STATE[\'"]\]\s*=\s*(?P<sigi_state>{.+});',
- webpage, 'sigi data', group='sigi_state')
- sigi_data = self._parse_json(sigi_json, video_id)
+ sigi_data = self._get_sigi_state(webpage, video_id)
status = traverse_obj(sigi_data, ('VideoPage', 'statusCode'), expected_type=int) or 0
video_data = traverse_obj(sigi_data, ('ItemModule', video_id), expected_type=dict)
'thumbnail': r're:https://.+_1080x1080\.webp'
},
'expected_warnings': ['Retrying']
+ }, {
+ 'url': 'https://www.tiktok.com/@6820838815978423302',
+ 'playlist_mincount': 5,
+ 'info_dict': {
+ 'id': '6820838815978423302',
+ 'title': '6820838815978423302',
+ 'thumbnail': r're:https://.+_1080x1080\.webp'
+ },
+ 'expected_warnings': ['Retrying']
}, {
'url': 'https://www.tiktok.com/@meme',
'playlist_mincount': 593,
webpage = self._download_webpage(url, user_name, headers={
'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)'
})
- user_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID')
+ user_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID', default=None) or user_name
videos = LazyList(self._video_entries_api(webpage, user_id, user_name))
thumbnail = traverse_obj(videos, (0, 'author', 'avatar_larger', 'url_list', 0))
try:
return self._extract_aweme_app(video_id)
except ExtractorError as e:
- self.report_warning(f'{e}; Retrying with webpage')
+ self.report_warning(f'{e}; trying with webpage')
webpage = self._download_webpage(url, video_id)
render_data_json = self._search_regex(
}]
def _real_extract(self, url):
- return self.url_result(self._request_webpage(
- HEADRequest(url), self._match_id(url), headers={'User-Agent': 'facebookexternalhit/1.1'}).geturl(), TikTokIE)
+ # Send a HEAD request for the given URL and take the URL of the response
+ # (geturl()) as the target. Unlike the removed lines above, no extractor class
+ # is passed to url_result any more.
+ new_url = self._request_webpage(
+ HEADRequest(url), self._match_id(url), headers={'User-Agent': 'facebookexternalhit/1.1'}).geturl()
+ if self.suitable(new_url): # Prevent infinite loop in case redirect fails
+ # The resulting URL would be handled by this same extractor again, so give up
+ # with UnsupportedError instead of delegating to ourselves.
+ raise UnsupportedError(new_url)
+ return self.url_result(new_url)