]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/tiktok.py
[cleanup] Misc
[yt-dlp.git] / yt_dlp / extractor / tiktok.py
CommitLineData
1ead840d
KS
1# coding: utf-8
2from __future__ import unicode_literals
f7f18f90
A
3
4import itertools
1ead840d
KS
5
6from .common import InfoExtractor
7from ..utils import (
ce18a19b 8 ExtractorError,
1ead840d
KS
9 int_or_none,
10 str_or_none,
4b6d03ed 11 try_get
1ead840d
KS
12)
13
14
class TikTokIE(InfoExtractor):
    """Extractor for a single TikTok video page (``/@<user>/video/<id>``).

    The video metadata is read from the ``__NEXT_DATA__`` JSON blob embedded
    in the video webpage.
    """
    _VALID_URL = r'https?://www\.tiktok\.com/@[\w\._]+/video/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610',
        'md5': '34a7543afd5a151b0840ba6736fb633b',
        'info_dict': {
            'id': '6748451240264420610',
            'ext': 'mp4',
            'title': '#jassmanak #lehanga #leenabhushan',
            'description': '#jassmanak #lehanga #leenabhushan',
            'duration': 13,
            'height': 1280,
            'width': 720,
            'uploader': 'leenabhushan',
            'uploader_id': '6691488002098119685',
            'uploader_url': 'https://www.tiktok.com/@leenabhushan',
            'creator': 'facestoriesbyleenabh',
            'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
            'upload_date': '20191016',
            'timestamp': 1571246252,
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
        }
    }, {
        'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en',
        'md5': '06b9800d47d5fe51a19e322dd86e61c9',
        'info_dict': {
            'id': '6742501081818877190',
            'ext': 'mp4',
            'title': 'md5:5e2a23877420bb85ce6521dbee39ba94',
            'description': 'md5:5e2a23877420bb85ce6521dbee39ba94',
            'duration': 27,
            'height': 960,
            'width': 540,
            'uploader': 'patrox',
            'uploader_id': '18702747',
            'uploader_url': 'https://www.tiktok.com/@patrox',
            'creator': 'patroX',
            'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
            'upload_date': '20190930',
            'timestamp': 1569860870,
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'comment_count': int,
        }
    }]

    def _extract_aweme(self, props_data, webpage, url):
        """Build the info dict for one video from the page's ``props`` data.

        :param props_data: the ``props`` object parsed from ``__NEXT_DATA__``.
        :param webpage: HTML of the video page; used for OpenGraph fallbacks.
        :param url: the video page URL; sent as ``Referer`` when downloading.
        :returns: an info dict describing the video.
        :raises ExtractorError: if the page data holds no ``itemStruct``.
        """
        video_info = try_get(
            props_data, lambda x: x['pageProps']['itemInfo']['itemStruct'], dict)
        if not video_info:
            # Without the item structure there is neither an id nor a media
            # URL; fail with a readable error instead of an AttributeError.
            raise ExtractorError('Unable to extract video information')
        author_info = try_get(video_info, lambda x: x['author'], dict) or {}
        stats_info = try_get(video_info, lambda x: x['stats'], dict) or {}

        user_id = str_or_none(author_info.get('uniqueId'))
        download_url = try_get(video_info, (lambda x: x['video']['playAddr'],
                                            lambda x: x['video']['downloadAddr']))
        height = try_get(video_info, lambda x: x['video']['height'], int)
        width = try_get(video_info, lambda x: x['video']['width'], int)

        # Only emit a thumbnail entry when a URL was actually found.
        thumbnail_url = video_info.get('thumbnail') or self._og_search_thumbnail(webpage)
        thumbnails = []
        if thumbnail_url:
            thumbnails.append({
                'url': thumbnail_url,
                'width': width,
                'height': height,
            })

        # NOTE(review): when '$wid' is absent this still sends the literal
        # string "None" as the cookie value; kept as-is because it is unclear
        # whether the download relies on the cookie merely being present.
        tracker = try_get(props_data, lambda x: x['initialProps']['$wid'])

        return {
            'id': str_or_none(video_info.get('id')),
            'url': download_url,
            'ext': 'mp4',
            'height': height,
            'width': width,
            'title': video_info.get('desc') or self._og_search_title(webpage),
            'duration': try_get(video_info, lambda x: x['video']['duration'], int),
            'view_count': int_or_none(stats_info.get('playCount')),
            'like_count': int_or_none(stats_info.get('diggCount')),
            'repost_count': int_or_none(stats_info.get('shareCount')),
            'comment_count': int_or_none(stats_info.get('commentCount')),
            # int_or_none also tolerates non-numeric strings, which a bare
            # int() inside try_get does not (ValueError is not swallowed).
            'timestamp': int_or_none(video_info.get('createTime')),
            'creator': str_or_none(author_info.get('nickname')),
            'uploader': user_id,
            'uploader_id': str_or_none(author_info.get('id')),
            # Avoid producing ".../@None" when the author handle is unknown.
            'uploader_url': f'https://www.tiktok.com/@{user_id}' if user_id else None,
            'thumbnails': thumbnails,
            'description': str_or_none(video_info.get('desc')),
            'webpage_url': self._og_search_url(webpage),
            'http_headers': {
                'Referer': url,
                'Cookie': 'tt_webid=%s; tt_webid_v2=%s' % (tracker, tracker),
            }
        }

    def _real_extract(self, url):
        """Download the video page and extract the video it describes.

        :raises ExtractorError: if the video is private (status code 10216)
            or the page does not report success (status code 0).
        """
        video_id = self._match_id(url)

        # If we only call once, we get a 403 when downloading the video.
        self._download_webpage(url, video_id)
        webpage = self._download_webpage(url, video_id, note='Downloading video webpage')
        json_string = self._search_regex(
            r'id=\"__NEXT_DATA__\"\s+type=\"application\/json\"\s*[^>]+>\s*(?P<json_string_ld>[^<]+)',
            webpage, 'json_string', group='json_string_ld')
        json_data = self._parse_json(json_string, video_id)
        props_data = try_get(json_data, lambda x: x['props'], expected_type=dict)

        # Check statusCode for success. try_get keeps a missing 'props' or
        # 'pageProps' from raising; status is then None and we fall through
        # to the generic error below.
        status = try_get(props_data, lambda x: x['pageProps']['statusCode'])
        if status == 0:
            return self._extract_aweme(props_data, webpage, url)
        elif status == 10216:
            raise ExtractorError('This video is private', expected=True)

        raise ExtractorError('Video not available', video_id=video_id)
f7f18f90
A
131
132
class TikTokUserIE(InfoExtractor):
    """Extractor for all videos on a TikTok user page (``/@<user>``).

    Videos are listed through the paginated ``api/post/item_list`` endpoint,
    which needs the ``s_v_web_id`` cookie to be available.
    """
    IE_NAME = 'tiktok:user'
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/@(?P<id>[\w\._]+)/?(?:$|[#?])'
    _TESTS = [{
        'url': 'https://tiktok.com/@corgibobaa?lang=en',
        'playlist_mincount': 45,
        'info_dict': {
            'id': '6935371178089399301',
        },
        'skip': 'Cookies (not necessarily logged in) are needed.'
    }, {
        'url': 'https://www.tiktok.com/@meme',
        'playlist_mincount': 593,
        'info_dict': {
            'id': '79005827461758976',
        },
        'skip': 'Cookies (not necessarily logged in) are needed.'
    }]

    def _entries(self, url, user_id):
        """Yield one info dict per video listed for the user.

        :param url: the user page URL.
        :param user_id: the user handle taken from the URL.
        :raises ExtractorError: if the page carries no numeric user id or the
            ``s_v_web_id`` cookie is missing (both indicate missing cookies).
        """
        webpage = self._download_webpage(url, user_id)
        # The numeric id is only used as a signal that the page was served
        # with usable cookies.
        own_id = self._search_regex(r'\"id\":\"(?P<userid>\d+)', webpage, 'user id', default=None)
        if not own_id:
            raise ExtractorError('Cookies (not necessarily logged in) are needed.', expected=True)
        secuid = self._search_regex(r'\"secUid\":\"(?P<secUid>[^\"]+)', webpage, 'secUid')
        verifyfp_cookie = self._get_cookies('https://www.tiktok.com').get('s_v_web_id')
        if not verifyfp_cookie:
            raise ExtractorError('Improper cookies (missing s_v_web_id).', expected=True)
        api_url = f'https://m.tiktok.com/api/post/item_list/?aid=1988&cookie_enabled=true&count=30&verifyFp={verifyfp_cookie.value}&secUid={secuid}&cursor='
        cursor = '0'
        for page in itertools.count():
            data_json = self._download_json(api_url + cursor, user_id, note='Downloading Page %d' % page)
            # 'itemList' may be absent or null; treat both as an empty page.
            for video in data_json.get('itemList') or []:
                video_id = video.get('id')
                if not video_id:
                    # Skip malformed items rather than aborting the listing.
                    continue
                video_url = f'https://www.tiktok.com/@{user_id}/video/{video_id}'
                download_url = try_get(video, (lambda x: x['video']['playAddr'],
                                               lambda x: x['video']['downloadAddr']))
                thumbnail = try_get(video, lambda x: x['video']['originCover'])
                height = try_get(video, lambda x: x['video']['height'], int)
                width = try_get(video, lambda x: x['video']['width'], int)
                yield {
                    'id': video_id,
                    'ie_key': TikTokIE.ie_key(),
                    'extractor': 'TikTok',
                    'url': download_url,
                    'ext': 'mp4',
                    'height': height,
                    'width': width,
                    'title': str_or_none(video.get('desc')),
                    'duration': try_get(video, lambda x: x['video']['duration'], int),
                    'view_count': try_get(video, lambda x: x['stats']['playCount'], int),
                    'like_count': try_get(video, lambda x: x['stats']['diggCount'], int),
                    'comment_count': try_get(video, lambda x: x['stats']['commentCount'], int),
                    'repost_count': try_get(video, lambda x: x['stats']['shareCount'], int),
                    # Normalised to int (or None), as TikTokIE does.
                    'timestamp': int_or_none(video.get('createTime')),
                    'creator': try_get(video, lambda x: x['author']['nickname'], str),
                    'uploader': try_get(video, lambda x: x['author']['uniqueId'], str),
                    'uploader_id': try_get(video, lambda x: x['author']['id'], str),
                    'uploader_url': f'https://www.tiktok.com/@{user_id}',
                    # Only emit a thumbnail entry when a URL was found.
                    'thumbnails': [{'url': thumbnail, 'height': height, 'width': width}] if thumbnail else [],
                    'description': str_or_none(video.get('desc')),
                    'webpage_url': video_url,
                    'http_headers': {
                        'Referer': video_url,
                    }
                }
            if not data_json.get('hasMore'):
                break
            # The cursor is appended to the URL, so it must be a string; stop
            # paging if the API did not return one.
            cursor = str_or_none(data_json.get('cursor'))
            if not cursor:
                break

    def _real_extract(self, url):
        """Return a playlist of the user's videos, identified by the handle in the URL."""
        user_id = self._match_id(url)
        return self.playlist_result(self._entries(url, user_id), user_id)