[cleanup] misc

diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py
index 8f0713e134f0ac53672a60bb8668a84109d0451f..412331e17c7d699a410cc9c81e5e64fb9accae9d 100644 (file)
--- a/yt_dlp/extractor/soundcloud.py
+++ b/yt_dlp/extractor/soundcloud.py
@@ -14,7 +14,6 @@
     compat_HTTPError,
     compat_kwargs,
     compat_str,
-    compat_urlparse,
 )
 from ..utils import (
     error_to_compat_str,
@@ -24,6 +23,8 @@
     int_or_none,
     KNOWN_EXTENSIONS,
     mimetype2ext,
+    remove_end,
+    parse_qs,
     str_or_none,
     try_get,
     unified_timestamp,
@@ -49,8 +50,7 @@ def _extract_urls(webpage):
             webpage)]
 
     def _real_extract(self, url):
-        query = compat_urlparse.parse_qs(
-            compat_urlparse.urlparse(url).query)
+        query = parse_qs(url)
         api_url = query['url'][0]
         secret_token = query.get('secret_token')
         if secret_token:
@@ -305,7 +305,7 @@ def _download_json(self, *args, **kwargs):
                 raise
 
     def _real_initialize(self):
-        self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'fSSdm5yTnDka1g0Fz1CO5Yx6z0NbeHAj'
+        self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf'
         self._login()
 
     _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
@@ -656,64 +656,60 @@ def _real_extract(self, url):
 
 class SoundcloudPagedPlaylistBaseIE(SoundcloudIE):
     def _extract_playlist(self, base_url, playlist_id, playlist_title):
+        return {
+            '_type': 'playlist',
+            'id': playlist_id,
+            'title': playlist_title,
+            'entries': self._entries(base_url, playlist_id),
+        }
+
+    def _entries(self, url, playlist_id):
         # Per the SoundCloud documentation, the maximum limit for a linked partitioning query is 200.
         # https://developers.soundcloud.com/blog/offset-pagination-deprecated
-        COMMON_QUERY = {
+        query = {
             'limit': 200,
             'linked_partitioning': '1',
+            'offset': 0,
         }
 
-        query = COMMON_QUERY.copy()
-        query['offset'] = 0
-
-        next_href = base_url
+        retries = self.get_param('extractor_retries', 3)
 
-        entries = []
         for i in itertools.count():
-            response = self._download_json(
-                next_href, playlist_id,
-                'Downloading track page %s' % (i + 1), query=query, headers=self._HEADERS)
-
-            collection = response['collection']
-
-            if not isinstance(collection, list):
-                collection = []
-
-            # Empty collection may be returned, in this case we proceed
-            # straight to next_href
-
-            def resolve_entry(candidates):
+            attempt, last_error = -1, None
+            while attempt < retries:
+                attempt += 1
+                if last_error:
+                    self.report_warning('%s. Retrying ...' % remove_end(last_error, '.'), playlist_id)
+                try:
+                    response = self._download_json(
+                        url, playlist_id, query=query, headers=self._HEADERS,
+                        note='Downloading track page %s%s' % (i + 1, f' (retry #{attempt})' if attempt else ''))
+                    break
+                except ExtractorError as e:
+                    # Downloading page may result in intermittent 502 HTTP error
+                    # See https://github.com/yt-dlp/yt-dlp/issues/872
+                    if attempt >= retries or not isinstance(e.cause, compat_HTTPError) or e.cause.code != 502:
+                        raise
+                    last_error = str(e.cause or e.msg)
+
+            def resolve_entry(*candidates):
                 for cand in candidates:
                     if not isinstance(cand, dict):
                         continue
                     permalink_url = url_or_none(cand.get('permalink_url'))
-                    if not permalink_url:
-                        continue
-                    return self.url_result(
-                        permalink_url,
-                        SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None,
-                        str_or_none(cand.get('id')), cand.get('title'))
-
-            for e in collection:
-                entry = resolve_entry((e, e.get('track'), e.get('playlist')))
-                if entry:
-                    entries.append(entry)
-
-            next_href = response.get('next_href')
-            if not next_href:
-                break
+                    if permalink_url:
+                        return self.url_result(
+                            permalink_url,
+                            SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None,
+                            str_or_none(cand.get('id')), cand.get('title'))
 
-            next_href = response['next_href']
-            parsed_next_href = compat_urlparse.urlparse(next_href)
-            query = compat_urlparse.parse_qs(parsed_next_href.query)
-            query.update(COMMON_QUERY)
+            for e in response['collection'] or []:
+                yield resolve_entry(e, e.get('track'), e.get('playlist'))
 
-        return {
-            '_type': 'playlist',
-            'id': playlist_id,
-            'title': playlist_title,
-            'entries': entries,
-        }
+            url = response.get('next_href')
+            if not url:
+                break
+            query.pop('offset', None)
 
 
 class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
@@ -859,7 +855,7 @@ def _real_extract(self, url):
 
 class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
     IE_NAME = 'soundcloud:search'
-    IE_DESC = 'Soundcloud search'
+    IE_DESC = 'Soundcloud search, "scsearch" keyword'
     _MAX_RESULTS = float('inf')
     _TESTS = [{
         'url': 'scsearch15:post-avant jazzcore',
@@ -884,25 +880,14 @@ def _get_collection(self, endpoint, collection_id, **query):
         })
         next_url = update_url_query(self._API_V2_BASE + endpoint, query)
 
-        collected_results = 0
-
         for i in itertools.count(1):
             response = self._download_json(
-                next_url, collection_id, 'Downloading page {0}'.format(i),
+                next_url, collection_id, f'Downloading page {i}',
                 'Unable to download API page', headers=self._HEADERS)
 
-            collection = response.get('collection', [])
-            if not collection:
-                break
-
-            collection = list(filter(bool, collection))
-            collected_results += len(collection)
-
-            for item in collection:
-                yield self.url_result(item['uri'], SoundcloudIE.ie_key())
-
-            if not collection or collected_results >= limit:
-                break
+            for item in response.get('collection') or []:
+                if item:
+                    yield self.url_result(item['uri'], SoundcloudIE.ie_key())
 
             next_url = response.get('next_href')
             if not next_url:
@@ -910,4 +895,4 @@ def _get_collection(self, endpoint, collection_id, **query):
 
     def _get_n_results(self, query, n):
         tracks = self._get_collection('search/tracks', query, limit=n, q=query)
-        return self.playlist_result(tracks, playlist_title=query)
+        return self.playlist_result(tracks, query, query)