[Drooble] Add extractor (#1547)

[yt-dlp.git] / yt_dlp / extractor / tiktok.py
diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py

index fc0915fb0210e5424dd9383f164fe4c146835900..c34235e960d39a9ce8db681951982dd6ad89b04c 100644 (file)
--- a/yt_dlp/extractor/tiktok.py
+++ b/yt_dlp/extractor/tiktok.py
@@ -12,6 +12,7 @@
  from ..utils import (
      ExtractorError,
      int_or_none,
+    join_nonempty,
      str_or_none,
      traverse_obj,
      try_get,
@@ -25,8 +26,9 @@ class TikTokBaseIE(InfoExtractor):
      _MANIFEST_APP_VERSION = '291'
      _APP_NAME = 'trill'
      _AID = 1180
-    _API_HOSTNAME = 'api-t2.tiktokv.com'
+    _API_HOSTNAME = 'api-h2.tiktokv.com'
      _UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s'
+    _WEBPAGE_HOST = 'https://www.tiktok.com/'
      QUALITIES = ('360p', '540p', '720p')
  
      def _call_api(self, ep, query, video_id, fatal=True,
@@ -38,8 +40,8 @@ def _call_api(self, ep, query, video_id, fatal=True,
              'build_number': self._APP_VERSION,
              'manifest_version_code': self._MANIFEST_APP_VERSION,
              'update_version_code': self._MANIFEST_APP_VERSION,
-            'openudid': ''.join(random.choice('0123456789abcdef') for i in range(16)),
-            'uuid': ''.join([random.choice(string.digits) for num in range(16)]),
+            'openudid': ''.join(random.choice('0123456789abcdef') for _ in range(16)),
+            'uuid': ''.join([random.choice(string.digits) for _ in range(16)]),
              '_rticket': int(time.time() * 1000),
              'ts': int(time.time()),
              'device_brand': 'Google',
@@ -66,7 +68,10 @@ def _call_api(self, ep, query, video_id, fatal=True,
              'as': 'a1qwert123',
              'cp': 'cbfhckdckkde1',
          }
-        self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for i in range(160)))
+        self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for _ in range(160)))
+        webpage_cookies = self._get_cookies(self._WEBPAGE_HOST)
+        if webpage_cookies.get('sid_tt'):
+            self._set_cookie(self._API_HOSTNAME, 'sid_tt', webpage_cookies['sid_tt'].value)
          return self._download_json(
              'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id,
              fatal=fatal, note=note, errnote=errnote, headers={
@@ -107,8 +112,8 @@ def extract_addr(addr, add_meta={}):
                  'acodec': 'aac',
                  'source_preference': -2 if 'aweme/v1' in url else -1,  # Downloads from API might get blocked
                  **add_meta, **parsed_meta,
-                'format_note': ' '.join(filter(None, (
-                    add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else '')))
+                'format_note': join_nonempty(
+                    add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' ')
              } for url in addr.get('url_list') or []]
  
          # Hack: Add direct video links first to prioritize them when removing duplicate formats
@@ -175,6 +180,7 @@ def extract_addr(addr, add_meta={}):
          user_url = self._UPLOADER_URL_FORMAT % (traverse_obj(author_info,
                                                               'sec_uid', 'id', 'uid', 'unique_id',
                                                               expected_type=str_or_none, get_all=False))
+        labels = traverse_obj(aweme_detail, ('hybrid_label', ..., 'text'), expected_type=str, default=[])
  
          contained_music_track = traverse_obj(
              music_info, ('matched_song', 'title'), ('matched_pgc_sound', 'title'), expected_type=str)
@@ -205,10 +211,14 @@ def extract_addr(addr, add_meta={}):
              'timestamp': int_or_none(aweme_detail.get('create_time')),
              'formats': formats,
              'thumbnails': thumbnails,
-            'duration': int_or_none(traverse_obj(video_info, 'duration', ('download_addr', 'duration')), scale=1000)
+            'duration': int_or_none(traverse_obj(video_info, 'duration', ('download_addr', 'duration')), scale=1000),
+            'availability': self._availability(
+                is_private='Private' in labels,
+                needs_subscription='Friends only' in labels,
+                is_unlisted='Followers only' in labels)
          }
  
-    def _parse_aweme_video_web(self, aweme_detail, webpage, url):
+    def _parse_aweme_video_web(self, aweme_detail, webpage_url):
          video_info = aweme_detail['video']
          author_info = traverse_obj(aweme_detail, 'author', 'authorInfo', default={})
          music_info = aweme_detail.get('music') or {}
@@ -277,7 +287,7 @@ def _parse_aweme_video_web(self, aweme_detail, webpage, url):
              'thumbnails': thumbnails,
              'description': str_or_none(aweme_detail.get('desc')),
              'http_headers': {
-                'Referer': url
+                'Referer': webpage_url
              }
          }
  
@@ -287,18 +297,18 @@ class TikTokIE(TikTokBaseIE):
  
      _TESTS = [{
          'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610',
-        'md5': '34a7543afd5a151b0840ba6736fb633b',
+        'md5': '736bb7a466c6f0a6afeb597da1e6f5b7',
          'info_dict': {
              'id': '6748451240264420610',
              'ext': 'mp4',
              'title': '#jassmanak #lehanga #leenabhushan',
              'description': '#jassmanak #lehanga #leenabhushan',
              'duration': 13,
-            'height': 1280,
-            'width': 720,
+            'height': 1024,
+            'width': 576,
              'uploader': 'leenabhushan',
              'uploader_id': '6691488002098119685',
-            'uploader_url': 'https://www.tiktok.com/@leenabhushan',
+            'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA_Eb4t1vodM1IuTy_cvp9CY22RAb59xqrO0Xtz9CYQJvgXaDvZxYnZYRzDWhhgJmy',
              'creator': 'facestoriesbyleenabh',
              'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
              'upload_date': '20191016',
@@ -310,7 +320,7 @@ class TikTokIE(TikTokBaseIE):
          }
      }, {
          'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en',
-        'md5': '06b9800d47d5fe51a19e322dd86e61c9',
+        'md5': '6f3cf8cdd9b28cb8363fe0a9a160695b',
          'info_dict': {
              'id': '6742501081818877190',
              'ext': 'mp4',
@@ -321,7 +331,7 @@ class TikTokIE(TikTokBaseIE):
              'width': 540,
              'uploader': 'patrox',
              'uploader_id': '18702747',
-            'uploader_url': 'https://www.tiktok.com/@patrox',
+            'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws',
              'creator': 'patroX',
              'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
              'upload_date': '20190930',
@@ -338,8 +348,18 @@ class TikTokIE(TikTokBaseIE):
      }]
  
      def _extract_aweme_app(self, aweme_id):
-        aweme_detail = self._call_api('aweme/detail', {'aweme_id': aweme_id}, aweme_id,
-                                      note='Downloading video details', errnote='Unable to download video details')['aweme_detail']
+        try:
+            aweme_detail = self._call_api('aweme/detail', {'aweme_id': aweme_id}, aweme_id,
+                                          note='Downloading video details', errnote='Unable to download video details').get('aweme_detail')
+            if not aweme_detail:
+                raise ExtractorError('Video not available', video_id=aweme_id)
+        except ExtractorError as e:
+            self.report_warning(f'{e}; Retrying with feed workaround')
+            feed_list = self._call_api('feed', {'aweme_id': aweme_id}, aweme_id,
+                                       note='Downloading video feed', errnote='Unable to download video feed').get('aweme_list') or []
+            aweme_detail = next(aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id)
+            if not aweme_detail:
+                raise ExtractorError('Unable to find video in feed', video_id=aweme_id)
          return self._parse_aweme_video_app(aweme_detail)
  
      def _real_extract(self, url):
@@ -362,7 +382,7 @@ def _real_extract(self, url):
          # Chech statusCode for success
          status = props_data.get('pageProps').get('statusCode')
          if status == 0:
-            return self._parse_aweme_video_web(props_data['pageProps']['itemInfo']['itemStruct'], webpage, url)
+            return self._parse_aweme_video_web(props_data['pageProps']['itemInfo']['itemStruct'], url)
          elif status == 10216:
              raise ExtractorError('This video is private', expected=True)
  
@@ -377,13 +397,17 @@ class TikTokUserIE(TikTokBaseIE):
          'playlist_mincount': 45,
          'info_dict': {
              'id': '6935371178089399301',
+            'title': 'corgibobaa',
          },
+        'expected_warnings': ['Retrying']
      }, {
          'url': 'https://www.tiktok.com/@meme',
          'playlist_mincount': 593,
          'info_dict': {
              'id': '79005827461758976',
+            'title': 'meme',
          },
+        'expected_warnings': ['Retrying']
      }]
  
      r'''  # TODO: Fix by adding _signature to api_url
@@ -412,7 +436,7 @@ def _entries_api(self, webpage, user_id, username):
              'max_cursor': 0,
              'min_cursor': 0,
              'retry_type': 'no_retry',
-            'device_id': ''.join(random.choice(string.digits) for i in range(19)),  # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api.
+            'device_id': ''.join(random.choice(string.digits) for _ in range(19)),  # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api.
          }
  
          max_retries = self.get_param('extractor_retries', 3)
@@ -430,21 +454,130 @@ def _entries_api(self, webpage, user_id, username):
                  break
              for video in post_list.get('aweme_list', []):
                  yield {
-                    **self._parse_aweme_video(video),
-                    'ie_key': TikTokIE.ie_key(),
+                    **self._parse_aweme_video_app(video),
+                    'extractor_key': TikTokIE.ie_key(),
                      'extractor': 'TikTok',
+                    'webpage_url': f'https://tiktok.com/@{user_id}/video/{video["aweme_id"]}',
                  }
              if not post_list.get('has_more'):
                  break
              query['max_cursor'] = post_list['max_cursor']
  
      def _real_extract(self, url):
-        user_id = self._match_id(url)
-        webpage = self._download_webpage(url, user_id, headers={
+        user_name = self._match_id(url)  # ID matched from the URL; reused below as the playlist title
+        webpage = self._download_webpage(url, user_name, headers={
+            'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)'  # NOTE(review): presumably a crawler UA is needed to get the snssdk:// link - confirm
+        })
+        user_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID')  # numeric ID from the snssdk://user/profile/<digits> link in the page
+        return self.playlist_result(self._entries_api(webpage, user_id, user_name), user_id, user_name)  # playlist id = numeric user ID, title = user_name
+
+
+class TikTokBaseListIE(TikTokBaseIE):
+    """Base for paginated video lists; subclasses must set _QUERY_NAME and _API_ENDPOINT"""
+    def _entries(self, list_id, display_id):
+        query = {
+            self._QUERY_NAME: list_id,
+            'cursor': 0,
+            'count': 20,
+            'type': 5,
+            'device_id': ''.join(random.choice(string.digits) for _ in range(19))
+        }
+        max_retries = self.get_param('extractor_retries', 3)
+        for page in itertools.count(1):
+            for retries in itertools.count():
+                try:
+                    post_list = self._call_api(self._API_ENDPOINT, query, display_id,
+                                               note='Downloading video list page %d%s' % (page, f' (attempt {retries})' if retries != 0 else ''),
+                                               errnote='Unable to download video list')
+                except ExtractorError as e:
+                    if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0 and retries != max_retries:
+                        self.report_warning('%s. Retrying...' % str(e.cause or e.msg))
+                        continue  # only JSON decode failures at position 0 are retried, up to max_retries times
+                    raise
+                break  # request succeeded; leave the retry loop
+            for video in post_list.get('aweme_list') or []:  # 'or []' also covers a key that is present but null
+                yield {
+                    **self._parse_aweme_video_app(video),
+                    'extractor_key': TikTokIE.ie_key(),
+                    'extractor': 'TikTok',
+                    'webpage_url': f'https://tiktok.com/@_/video/{video["aweme_id"]}',
+                }
+            if not post_list.get('has_more'):
+                break
+            query['cursor'] = post_list['cursor']  # continue from the cursor the API returned for the next page
+
+    def _real_extract(self, url):
+        list_id = self._match_id(url)
+        return self.playlist_result(self._entries(list_id, list_id), list_id)
+
+
+class TikTokSoundIE(TikTokBaseListIE):
+    IE_NAME = 'tiktok:sound'
+    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/music/[\w\.-]+-(?P<id>[\d]+)[/?#&]?'  # <id> is the run of digits after the last '-' of the /music/ slug
+    _QUERY_NAME = 'music_id'  # query key under which _entries() sends the list ID
+    _API_ENDPOINT = 'music/aweme'  # endpoint that _entries() passes to _call_api()
+    _TESTS = [{
+        'url': 'https://www.tiktok.com/music/Build-a-Btch-6956990112127585029?lang=en',
+        'playlist_mincount': 100,
+        'info_dict': {
+            'id': '6956990112127585029'
+        },
+        'expected_warnings': ['Retrying']
+    }, {
+        # The actual entries are fewer than the listed video count
+        'url': 'https://www.tiktok.com/music/jiefei-soap-remix-7036843036118469381',
+        'playlist_mincount': 2182,
+        'info_dict': {
+            'id': '7036843036118469381'
+        },
+        'expected_warnings': ['Retrying']
+    }]
+
+
+class TikTokEffectIE(TikTokBaseListIE):
+    IE_NAME = 'tiktok:effect'
+    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/sticker/[\w\.-]+-(?P<id>[\d]+)[/?#&]?'  # <id> is the run of digits after the last '-' of the /sticker/ slug
+    _QUERY_NAME = 'sticker_id'  # query key under which _entries() sends the list ID
+    _API_ENDPOINT = 'sticker/aweme'  # endpoint that _entries() passes to _call_api()
+    _TESTS = [{
+        'url': 'https://www.tiktok.com/sticker/MATERIAL-GWOOORL-1258156',
+        'playlist_mincount': 100,
+        'info_dict': {
+            'id': '1258156',
+        },
+        'expected_warnings': ['Retrying']
+    }, {
+        # Entries differ between mobile and web, depending on region
+        'url': 'https://www.tiktok.com/sticker/Elf-Friend-479565',
+        'only_matching': True
+    }]
+
+
+class TikTokTagIE(TikTokBaseListIE):
+    IE_NAME = 'tiktok:tag'
+    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/tag/(?P<id>[^/?#&]+)'  # <id> is the tag name from the URL, not the numeric tag ID
+    _QUERY_NAME = 'ch_id'  # query key under which _entries() sends the numeric tag ID
+    _API_ENDPOINT = 'challenge/aweme'  # endpoint that _entries() passes to _call_api()
+    _TESTS = [{
+        'url': 'https://tiktok.com/tag/hello2018',
+        'playlist_mincount': 39,
+        'info_dict': {
+            'id': '46294678',
+            'title': 'hello2018',
+        },
+        'expected_warnings': ['Retrying']
+    }, {
+        'url': 'https://tiktok.com/tag/fypシ?is_copy_url=0&is_from_webapp=v1',
+        'only_matching': True
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)  # tag name as written in the URL
+        webpage = self._download_webpage(url, display_id, headers={
              'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)'
          })
-        own_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID')
-        return self.playlist_result(self._entries_api(webpage, own_id, user_id), user_id)
+        tag_id = self._html_search_regex(r'snssdk\d*://challenge/detail/(\d+)', webpage, 'tag ID')  # numeric ID from the snssdk://challenge/detail/<digits> link in the page
+        return self.playlist_result(self._entries(tag_id, display_id), tag_id, display_id)  # playlist id = numeric tag ID, title = tag name
  
  
  class DouyinIE(TikTokIE):
@@ -536,6 +669,7 @@ class DouyinIE(TikTokIE):
      _AID = 1128
      _API_HOSTNAME = 'aweme.snssdk.com'
      _UPLOADER_URL_FORMAT = 'https://www.douyin.com/user/%s'
+    _WEBPAGE_HOST = 'https://www.douyin.com/'
  
      def _real_extract(self, url):
          video_id = self._match_id(url)
@@ -556,4 +690,4 @@ def _real_extract(self, url):
          render_data = self._parse_json(
              render_data_json, video_id, transform_source=compat_urllib_parse_unquote)
          return self._parse_aweme_video_web(
-            traverse_obj(render_data, (..., 'aweme', 'detail'), get_all=False), webpage, url)
+            traverse_obj(render_data, (..., 'aweme', 'detail'), get_all=False), url)