yt_dlp/extractor/tiktok.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import itertools
   5
   6 from .common import InfoExtractor
   7 from ..utils import (
   8     ExtractorError,
   9     int_or_none,
  10     str_or_none,
  11     try_get
  12 )
  13
  14
  15 class TikTokIE(InfoExtractor):
  16     _VALID_URL = r'https?://www\.tiktok\.com/@[\w\._]+/video/(?P<id>\d+)'
  17
  18     _TESTS = [{
  19         'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610',
  20         'md5': '34a7543afd5a151b0840ba6736fb633b',
  21         'info_dict': {
  22             'id': '6748451240264420610',
  23             'ext': 'mp4',
  24             'title': '#jassmanak #lehanga #leenabhushan',
  25             'description': '#jassmanak #lehanga #leenabhushan',
  26             'duration': 13,
  27             'height': 1280,
  28             'width': 720,
  29             'uploader': 'leenabhushan',
  30             'uploader_id': '6691488002098119685',
  31             'uploader_url': 'https://www.tiktok.com/@leenabhushan',
  32             'creator': 'facestoriesbyleenabh',
  33             'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
  34             'upload_date': '20191016',
  35             'timestamp': 1571246252,
  36             'view_count': int,
  37             'like_count': int,
  38             'repost_count': int,
  39             'comment_count': int,
  40         }
  41     }, {
  42         'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en',
  43         'md5': '06b9800d47d5fe51a19e322dd86e61c9',
  44         'info_dict': {
  45             'id': '6742501081818877190',
  46             'ext': 'mp4',
  47             'title': 'md5:5e2a23877420bb85ce6521dbee39ba94',
  48             'description': 'md5:5e2a23877420bb85ce6521dbee39ba94',
  49             'duration': 27,
  50             'height': 960,
  51             'width': 540,
  52             'uploader': 'patrox',
  53             'uploader_id': '18702747',
  54             'uploader_url': 'https://www.tiktok.com/@patrox',
  55             'creator': 'patroX',
  56             'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
  57             'upload_date': '20190930',
  58             'timestamp': 1569860870,
  59             'view_count': int,
  60             'like_count': int,
  61             'repost_count': int,
  62             'comment_count': int,
  63         }
  64     }]
  65
  66     def _extract_aweme(self, props_data, webpage, url):
  67         video_info = try_get(
  68             props_data, lambda x: x['pageProps']['itemInfo']['itemStruct'], dict)
  69         author_info = try_get(
  70             props_data, lambda x: x['pageProps']['itemInfo']['itemStruct']['author'], dict) or {}
  71         stats_info = try_get(props_data, lambda x: x['pageProps']['itemInfo']['itemStruct']['stats'], dict) or {}
  72
  73         user_id = str_or_none(author_info.get('uniqueId'))
  74         download_url = try_get(video_info, (lambda x: x['video']['playAddr'],
  75                                             lambda x: x['video']['downloadAddr']))
  76         height = try_get(video_info, lambda x: x['video']['height'], int)
  77         width = try_get(video_info, lambda x: x['video']['width'], int)
  78         thumbnails = [{
  79             'url': video_info.get('thumbnail') or self._og_search_thumbnail(webpage),
  80             'width': width,
  81             'height': height
  82         }]
  83         tracker = try_get(props_data, lambda x: x['initialProps']['$wid'])
  84
  85         return {
  86             'id': str_or_none(video_info.get('id')),
  87             'url': download_url,
  88             'ext': 'mp4',
  89             'height': height,
  90             'width': width,
  91             'title': video_info.get('desc') or self._og_search_title(webpage),
  92             'duration': try_get(video_info, lambda x: x['video']['duration'], int),
  93             'view_count': int_or_none(stats_info.get('playCount')),
  94             'like_count': int_or_none(stats_info.get('diggCount')),
  95             'repost_count': int_or_none(stats_info.get('shareCount')),
  96             'comment_count': int_or_none(stats_info.get('commentCount')),
  97             'timestamp': try_get(video_info, lambda x: int(x['createTime']), int),
  98             'creator': str_or_none(author_info.get('nickname')),
  99             'uploader': user_id,
 100             'uploader_id': str_or_none(author_info.get('id')),
 101             'uploader_url': f'https://www.tiktok.com/@{user_id}',
 102             'thumbnails': thumbnails,
 103             'description': str_or_none(video_info.get('desc')),
 104             'webpage_url': self._og_search_url(webpage),
 105             'http_headers': {
 106                 'Referer': url,
 107                 'Cookie': 'tt_webid=%s; tt_webid_v2=%s' % (tracker, tracker),
 108             }
 109         }
 110
 111     def _real_extract(self, url):
 112         video_id = self._match_id(url)
 113
 114         # If we only call once, we get a 403 when downlaoding the video.
 115         self._download_webpage(url, video_id)
 116         webpage = self._download_webpage(url, video_id, note='Downloading video webpage')
 117         json_string = self._search_regex(
 118             r'id=\"__NEXT_DATA__\"\s+type=\"application\/json\"\s*[^>]+>\s*(?P<json_string_ld>[^<]+)',
 119             webpage, 'json_string', group='json_string_ld')
 120         json_data = self._parse_json(json_string, video_id)
 121         props_data = try_get(json_data, lambda x: x['props'], expected_type=dict)
 122
 123         # Chech statusCode for success
 124         status = props_data.get('pageProps').get('statusCode')
 125         if status == 0:
 126             return self._extract_aweme(props_data, webpage, url)
 127         elif status == 10216:
 128             raise ExtractorError('This video is private', expected=True)
 129
 130         raise ExtractorError('Video not available', video_id=video_id)
 131
 132
 133 class TikTokUserIE(InfoExtractor):
 134     IE_NAME = 'tiktok:user'
 135     _VALID_URL = r'https?://(?:www\.)?tiktok\.com/@(?P<id>[\w\._]+)/?(?:$|[#?])'
 136     _TESTS = [{
 137         'url': 'https://tiktok.com/@corgibobaa?lang=en',
 138         'playlist_mincount': 45,
 139         'info_dict': {
 140             'id': '6935371178089399301',
 141         },
 142         'skip': 'Cookies (not necessarily logged in) are needed.'
 143     }, {
 144         'url': 'https://www.tiktok.com/@meme',
 145         'playlist_mincount': 593,
 146         'info_dict': {
 147             'id': '79005827461758976',
 148         },
 149         'skip': 'Cookies (not necessarily logged in) are needed.'
 150     }]
 151
 152     def _entries(self, url, user_id):
 153         webpage = self._download_webpage(url, user_id)
 154         own_id = self._search_regex(r'\"id\":\"(?P<userid>\d+)', webpage, user_id, default=None)
 155         if not own_id:
 156             raise ExtractorError('Cookies (not necessarily logged in) are needed.', expected=True)
 157         secuid = self._search_regex(r'\"secUid\":\"(?P<secUid>[^\"]+)', webpage, user_id)
 158         verifyfp_cookie = self._get_cookies('https://www.tiktok.com').get('s_v_web_id')
 159         if not verifyfp_cookie:
 160             raise ExtractorError('Improper cookies (missing s_v_web_id).', expected=True)
 161         api_url = f'https://m.tiktok.com/api/post/item_list/?aid=1988&cookie_enabled=true&count=30&verifyFp={verifyfp_cookie.value}&secUid={secuid}&cursor='
 162         cursor = '0'
 163         for page in itertools.count():
 164             data_json = self._download_json(api_url + cursor, user_id, note='Downloading Page %d' % page)
 165             for video in data_json.get('itemList', []):
 166                 video_id = video['id']
 167                 video_url = f'https://www.tiktok.com/@{user_id}/video/{video_id}'
 168                 download_url = try_get(video, (lambda x: x['video']['playAddr'],
 169                                                lambda x: x['video']['downloadAddr']))
 170                 thumbnail = try_get(video, lambda x: x['video']['originCover'])
 171                 height = try_get(video, lambda x: x['video']['height'], int)
 172                 width = try_get(video, lambda x: x['video']['width'], int)
 173                 yield {
 174                     'id': video_id,
 175                     'ie_key': TikTokIE.ie_key(),
 176                     'extractor': 'TikTok',
 177                     'url': download_url,
 178                     'ext': 'mp4',
 179                     'height': height,
 180                     'width': width,
 181                     'title': str_or_none(video.get('desc')),
 182                     'duration': try_get(video, lambda x: x['video']['duration'], int),
 183                     'view_count': try_get(video, lambda x: x['stats']['playCount'], int),
 184                     'like_count': try_get(video, lambda x: x['stats']['diggCount'], int),
 185                     'comment_count': try_get(video, lambda x: x['stats']['commentCount'], int),
 186                     'repost_count': try_get(video, lambda x: x['stats']['shareCount'], int),
 187                     'timestamp': video.get('createTime'),
 188                     'creator': try_get(video, lambda x: x['author']['nickname'], str),
 189                     'uploader': try_get(video, lambda x: x['author']['uniqueId'], str),
 190                     'uploader_id': try_get(video, lambda x: x['author']['id'], str),
 191                     'uploader_url': f'https://www.tiktok.com/@{user_id}',
 192                     'thumbnails': [{'url': thumbnail, 'height': height, 'width': width}],
 193                     'description': str_or_none(video.get('desc')),
 194                     'webpage_url': video_url,
 195                     'http_headers': {
 196                         'Referer': video_url,
 197                     }
 198                 }
 199             if not data_json.get('hasMore'):
 200                 break
 201             cursor = data_json['cursor']
 202
 203     def _real_extract(self, url):
 204         user_id = self._match_id(url)
 205         return self.playlist_result(self._entries(url, user_id), user_id)