]> jfr.im git - yt-dlp.git/blobdiff - yt_dlp/extractor/generic.py
[extractor] Deprecate `_sort_formats`
[yt-dlp.git] / yt_dlp / extractor / generic.py
index 828c8a6cff4ef1a8fadf0ee4aa988ea517dc2e8c..85581e62280738fd87a5ecb0469e90b7633d82d3 100644 (file)
@@ -1,5 +1,6 @@
 import os
 import re
+import types
 import urllib.parse
 import xml.etree.ElementTree
 
@@ -31,6 +32,7 @@
     unified_timestamp,
     unsmuggle_url,
     url_or_none,
+    variadic,
     xpath_attr,
     xpath_text,
     xpath_with_ns,
@@ -873,22 +875,6 @@ class GenericIE(InfoExtractor):
                 'thumbnail': r're:^https?://.*\.jpg$',
             },
         },
-        # Wistia embed
-        {
-            'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
-            'md5': '1953f3a698ab51cfc948ed3992a0b7ff',
-            'info_dict': {
-                'id': '6e2wtrbdaf',
-                'ext': 'mov',
-                'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england',
-                'description': 'a Paywall Videos video from Remilon',
-                'duration': 644.072,
-                'uploader': 'study.com',
-                'timestamp': 1459678540,
-                'upload_date': '20160403',
-                'filesize': 24687186,
-            },
-        },
         # Wistia standard embed (async)
         {
             'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/',
@@ -903,7 +889,8 @@ class GenericIE(InfoExtractor):
             },
             'params': {
                 'skip_download': True,
-            }
+            },
+            'skip': 'webpage 404 not found',
         },
         # Soundcloud embed
         {
@@ -1086,18 +1073,6 @@ class GenericIE(InfoExtractor):
                 'skip_download': True,
             }
         },
-        {
-            # JWPlatform iframe
-            'url': 'https://www.covermagazine.co.uk/feature/2465255/business-protection-involved',
-            'info_dict': {
-                'id': 'AG26UQXM',
-                'ext': 'mp4',
-                'upload_date': '20160719',
-                'timestamp': 468923808,
-                'title': '2016_05_18 Cover L&G Business Protection V1 FINAL.mp4',
-            },
-            'add_ie': ['JWPlatform'],
-        },
         {
             # Video.js embed, multiple formats
             'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html',
@@ -2006,22 +1981,6 @@ class GenericIE(InfoExtractor):
             },
             'playlist_count': 6,
         },
-        {
-            # Squarespace video embed, 2019-08-28
-            'url': 'http://ootboxford.com',
-            'info_dict': {
-                'id': 'Tc7b_JGdZfw',
-                'title': 'Out of the Blue, at Childish Things 10',
-                'ext': 'mp4',
-                'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f',
-                'uploader_id': 'helendouglashouse',
-                'uploader': 'Helen & Douglas House',
-                'upload_date': '20140328',
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
         # {
         #     # Zype embed
         #     'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites',
@@ -2440,40 +2399,6 @@ class GenericIE(InfoExtractor):
                 'upload_date': '20210111',
             }
         },
-        {
-            'note': 'Rumble embed',
-            'url': 'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html',
-            'md5': '53af34098a7f92c4e51cf0bd1c33f009',
-            'info_dict': {
-                'id': 'vb0ofn',
-                'ext': 'mp4',
-                'timestamp': 1612662578,
-                'uploader': 'LovingMontana',
-                'channel': 'LovingMontana',
-                'upload_date': '20210207',
-                'title': 'Winter-loving dog helps girls dig a snow fort ',
-                'channel_url': 'https://rumble.com/c/c-546523',
-                'thumbnail': 'https://sp.rmbl.ws/s8/1/5/f/x/x/5fxxb.OvCc.1-small-Moose-The-Dog-Helps-Girls-D.jpg',
-                'duration': 103,
-            }
-        },
-        {
-            'note': 'Rumble JS embed',
-            'url': 'https://therightscoop.com/what-does-9-plus-1-plus-1-equal-listen-to-this-audio-of-attempted-kavanaugh-assassins-call-and-youll-get-it',
-            'md5': '4701209ac99095592e73dbba21889690',
-            'info_dict': {
-                'id': 'v15eqxl',
-                'ext': 'mp4',
-                'channel': 'Mr Producer Media',
-                'duration': 92,
-                'title': '911 Audio From The Man Who Wanted To Kill Supreme Court Justice Kavanaugh',
-                'channel_url': 'https://rumble.com/c/RichSementa',
-                'thumbnail': 'https://sp.rmbl.ws/s8/1/P/j/f/A/PjfAe.OvCc-small-911-Audio-From-The-Man-Who-.jpg',
-                'timestamp': 1654892716,
-                'uploader': 'Mr Producer Media',
-                'upload_date': '20220610',
-            }
-        },
         {
             'note': 'JSON LD with multiple @type',
             'url': 'https://www.nu.nl/280161/video/hoe-een-bladvlo-dit-verwoestende-japanse-onkruid-moet-vernietigen.html',
@@ -2490,6 +2415,21 @@ class GenericIE(InfoExtractor):
                 'duration': 111.0,
             }
         },
+        {
+            'note': 'JSON LD with unexpected data type',
+            'url': 'https://www.autoweek.nl/autotests/artikel/porsche-911-gt3-rs-rij-impressie-2/',
+            'info_dict': {
+                'id': 'porsche-911-gt3-rs-rij-impressie-2',
+                'ext': 'mp4',
+                'title': 'Test: Porsche 911 GT3 RS',
+                'description': 'Je ziet het niet, maar het is er wel. Downforce, hebben we het dan over. En in de nieuwe Porsche 911 GT3 RS is er zelfs heel veel downforce.',
+                'timestamp': 1664920902,
+                'upload_date': '20221004',
+                'thumbnail': r're:^https://media.autoweek.nl/m/.+\.jpg$',
+                'age_limit': 0,
+                'direct': True,
+            }
+        }
     ]
 
     def report_following_redirect(self, new_url):
@@ -2621,6 +2561,7 @@ def _real_extract(self, url):
                     default_search += ':'
                 return self.url_result(default_search + url)
 
+        original_url = url
         url, smuggled_data = unsmuggle_url(url, {})
         force_videoid = None
         is_intentional = smuggled_data.get('to_generic')
@@ -2643,7 +2584,9 @@ def _real_extract(self, url):
             **smuggled_data.get('http_headers', {})
         })
         new_url = full_response.geturl()
-        if url != new_url:
+        if new_url == urllib.parse.urlparse(url)._replace(scheme='https').geturl():
+            url = new_url
+        elif url != new_url:
             self.report_following_redirect(new_url)
             if force_videoid:
                 new_url = smuggle_url(new_url, {'force_videoid': force_videoid})
@@ -2676,7 +2619,6 @@ def _real_extract(self, url):
                     'vcodec': 'none' if m.group('type') == 'audio' else None
                 }]
                 info_dict['direct'] = True
-            self._sort_formats(formats)
             info_dict.update({
                 'formats': formats,
                 'subtitles': subtitles,
@@ -2694,7 +2636,6 @@ def _real_extract(self, url):
         if first_bytes.startswith(b'#EXTM3U'):
             self.report_detected('M3U playlist')
             info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
-            self._sort_formats(info_dict['formats'])
             return info_dict
 
         # Maybe it's a direct link to a video?
@@ -2728,12 +2669,10 @@ def _real_extract(self, url):
             elif doc.tag == 'SmoothStreamingMedia':
                 info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url)
                 self.report_detected('ISM manifest')
-                self._sort_formats(info_dict['formats'])
                 return info_dict
             elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
                 smil = self._parse_smil(doc, url, video_id)
                 self.report_detected('SMIL file')
-                self._sort_formats(smil['formats'])
                 return smil
             elif doc.tag == '{http://xspf.org/ns/0/}playlist':
                 self.report_detected('XSPF playlist')
@@ -2748,12 +2687,10 @@ def _real_extract(self, url):
                     mpd_base_url=full_response.geturl().rpartition('/')[0],
                     mpd_url=url)
                 self.report_detected('DASH manifest')
-                self._sort_formats(info_dict['formats'])
                 return info_dict
             elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
                 info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id)
                 self.report_detected('F4M manifest')
-                self._sort_formats(info_dict['formats'])
                 return info_dict
         except xml.etree.ElementTree.ParseError:
             pass
@@ -2765,14 +2702,26 @@ def _real_extract(self, url):
             #   Site Name | Video Title
             #   Video Title - Tagline | Site Name
             # and so on and so forth; it's just not practical
-            'title': (self._og_search_title(webpage, default=None)
-                      or self._html_extract_title(webpage, 'video title', default='video')),
+            'title': self._generic_title('', webpage, default='video'),
             'description': self._og_search_description(webpage, default=None),
             'thumbnail': self._og_search_thumbnail(webpage, default=None),
             'age_limit': self._rta_search(webpage),
         })
 
-        domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader', default=None)
+        self._downloader.write_debug('Looking for embeds')
+        embeds = list(self._extract_embeds(original_url, webpage, urlh=full_response, info_dict=info_dict))
+        if len(embeds) == 1:
+            return {**info_dict, **embeds[0]}
+        elif embeds:
+            return self.playlist_result(embeds, **info_dict)
+        raise UnsupportedError(url)
+
+    def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
+        """Returns an iterator of video entries"""
+        info_dict = types.MappingProxyType(info_dict)  # Prevents accidental mutation
+        video_id = traverse_obj(info_dict, 'display_id', 'id') or self._generic_id(url)
+        url, smuggled_data = unsmuggle_url(url, {})
+        actual_url = urlh.geturl() if urlh else url
 
         # Sometimes embedded video player is hidden behind percent encoding
         # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
@@ -2781,38 +2730,20 @@ def _real_extract(self, url):
         # There probably should be a second run of generic extractor on unescaped webpage.
         # webpage = urllib.parse.unquote(webpage)
 
-        # Unescape squarespace embeds to be detected by generic extractor,
-        # see https://github.com/ytdl-org/youtube-dl/issues/21294
-        webpage = re.sub(
-            r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>',
-            lambda x: unescapeHTML(x.group(0)), webpage)
-
         # TODO: Move to respective extractors
-        self._downloader.write_debug('Looking for Brightcove embeds')
         bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
         if bc_urls:
-            entries = [{
-                '_type': 'url',
-                'url': smuggle_url(bc_url, {'Referer': url}),
-                'ie_key': 'BrightcoveLegacy'
-            } for bc_url in bc_urls]
-
-            return {
-                '_type': 'playlist',
-                'title': info_dict['title'],
-                'id': video_id,
-                'entries': entries,
-            }
+            return [self.url_result(smuggle_url(bc_url, {'Referer': url}), BrightcoveLegacyIE)
+                    for bc_url in bc_urls]
         bc_urls = BrightcoveNewIE._extract_brightcove_urls(self, webpage)
         if bc_urls:
-            return self.playlist_from_matches(
-                bc_urls, video_id, info_dict['title'],
-                getter=lambda x: smuggle_url(x, {'referrer': url}),
-                ie='BrightcoveNew')
+            return [self.url_result(smuggle_url(bc_url, {'Referer': url}), BrightcoveNewIE)
+                    for bc_url in bc_urls]
 
-        self._downloader.write_debug('Looking for embeds')
         embeds = []
         for ie in self._downloader._ies.values():
+            if ie.ie_key() in smuggled_data.get('block_ies', []):
+                continue
             gen = ie.extract_from_webpage(self._downloader, url, webpage)
             current_embeds = []
             try:
@@ -2821,35 +2752,26 @@ def _real_extract(self, url):
             except self.StopExtraction:
                 self.report_detected(f'{ie.IE_NAME} exclusive embed', len(current_embeds),
                                      embeds and 'discarding other embeds')
-                embeds = current_embeds
-                break
+                return current_embeds
             except StopIteration:
                 self.report_detected(f'{ie.IE_NAME} embed', len(current_embeds))
                 embeds.extend(current_embeds)
 
-        del current_embeds
-        if len(embeds) == 1:
-            return {**info_dict, **embeds[0]}
-        elif embeds:
-            return self.playlist_result(embeds, **info_dict)
+        if embeds:
+            return embeds
 
         jwplayer_data = self._find_jwplayer_data(
             webpage, video_id, transform_source=js_to_json)
         if jwplayer_data:
             if isinstance(jwplayer_data.get('playlist'), str):
                 self.report_detected('JW Player playlist')
-                return {
-                    **info_dict,
-                    '_type': 'url',
-                    'ie_key': 'JWPlatform',
-                    'url': jwplayer_data['playlist'],
-                }
+                return [self.url_result(jwplayer_data['playlist'], 'JWPlatform')]
             try:
                 info = self._parse_jwplayer_data(
                     jwplayer_data, video_id, require_title=False, base_url=url)
                 if traverse_obj(info, 'formats', ('entries', ..., 'formats')):
                     self.report_detected('JW Player data')
-                    return merge_dicts(info, info_dict)
+                    return [info]
             except ExtractorError:
                 # See https://github.com/ytdl-org/youtube-dl/pull/16735
                 pass
@@ -2860,11 +2782,8 @@ def _real_extract(self, url):
             webpage)
         if mobj is not None:
             varname = mobj.group(1)
-            sources = self._parse_json(
-                mobj.group(2), video_id, transform_source=js_to_json,
-                fatal=False) or []
-            if not isinstance(sources, list):
-                sources = [sources]
+            sources = variadic(self._parse_json(
+                mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or [])
             formats = []
             subtitles = {}
             for source in sources:
@@ -2877,7 +2796,7 @@ def _real_extract(self, url):
                     src_type = src_type.lower()
                 ext = determine_ext(src).lower()
                 if src_type == 'video/youtube':
-                    return self.url_result(src, YoutubeIE.ie_key())
+                    return [self.url_result(src, YoutubeIE.ie_key())]
                 if src_type == 'application/dash+xml' or ext == 'mpd':
                     fmts, subs = self._extract_mpd_formats_and_subtitles(
                         src, video_id, mpd_id='dash', fatal=False)
@@ -2895,7 +2814,7 @@ def _real_extract(self, url):
                         'ext': (mimetype2ext(src_type)
                                 or ext if ext in KNOWN_EXTENSIONS else 'mp4'),
                         'http_headers': {
-                            'Referer': full_response.geturl(),
+                            'Referer': actual_url,
                         },
                     })
             # https://docs.videojs.com/player#addRemoteTextTrack
@@ -2910,28 +2829,25 @@ def _real_extract(self, url):
                     'url': urllib.parse.urljoin(url, src),
                     'name': sub.get('label'),
                     'http_headers': {
-                        'Referer': full_response.geturl(),
+                        'Referer': actual_url,
                     },
                 })
             if formats or subtitles:
                 self.report_detected('video.js embed')
-                self._sort_formats(formats)
-                info_dict['formats'] = formats
-                info_dict['subtitles'] = subtitles
-                return info_dict
+                return [{'formats': formats, 'subtitles': subtitles}]
 
         # Looking for http://schema.org/VideoObject
         json_ld = self._search_json_ld(webpage, video_id, default={})
         if json_ld.get('url') not in (url, None):
             self.report_detected('JSON LD')
-            return merge_dicts({
-                '_type': 'url_transparent',
+            return [merge_dicts({
+                '_type': 'video' if json_ld.get('ext') else 'url_transparent',
                 'url': smuggle_url(json_ld['url'], {
                     'force_videoid': video_id,
                     'to_generic': True,
                     'http_headers': {'Referer': url},
                 }),
-            }, json_ld, info_dict)
+            }, json_ld)]
 
         def check_video(vurl):
             if YoutubeIE.suitable(vurl):
@@ -3000,15 +2916,13 @@ def filter_video(urls):
                     if not formats[-1].get('height'):
                         formats[-1]['quality'] = 1
 
-                self._sort_formats(formats)
-
-                return {
+                return [{
                     'id': flashvars['video_id'],
                     'display_id': display_id,
                     'title': title,
                     'thumbnail': thumbnail,
                     'formats': formats,
-                }
+                }]
         if not found:
             # Broaden the search a little bit
             found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
@@ -3062,17 +2976,14 @@ def filter_video(urls):
                 webpage)
             if not found:
                 # Look also in Refresh HTTP header
-                refresh_header = full_response.headers.get('Refresh')
+                refresh_header = urlh and urlh.headers.get('Refresh')
                 if refresh_header:
                     found = re.search(REDIRECT_REGEX, refresh_header)
             if found:
                 new_url = urllib.parse.urljoin(url, unescapeHTML(found.group(1)))
                 if new_url != url:
                     self.report_following_redirect(new_url)
-                    return {
-                        '_type': 'url',
-                        'url': new_url,
-                    }
+                    return [self.url_result(new_url)]
                 else:
                     found = None
 
@@ -3083,10 +2994,12 @@ def filter_video(urls):
             embed_url = self._html_search_meta('twitter:player', webpage, default=None)
             if embed_url and embed_url != url:
                 self.report_detected('twitter:player iframe')
-                return self.url_result(embed_url)
+                return [self.url_result(embed_url)]
 
         if not found:
-            raise UnsupportedError(url)
+            return []
+
+        domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader', default=None)
 
         entries = []
         for video_url in orderedSet(found):
@@ -3102,7 +3015,7 @@ def filter_video(urls):
 
             video_id = os.path.splitext(video_id)[0]
             headers = {
-                'referer': full_response.geturl()
+                'referer': actual_url
             }
 
             entry_info_dict = {
@@ -3126,7 +3039,7 @@ def filter_video(urls):
             if ext == 'smil':
                 entry_info_dict = {**self._extract_smil_info(video_url, video_id), **entry_info_dict}
             elif ext == 'xspf':
-                return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
+                return [self._extract_xspf_playlist(video_url, video_id)]
             elif ext == 'm3u8':
                 entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers)
             elif ext == 'mpd':
@@ -3151,19 +3064,11 @@ def filter_video(urls):
             else:
                 entry_info_dict['url'] = video_url
 
-            if entry_info_dict.get('formats'):
-                self._sort_formats(entry_info_dict['formats'])
-
             entries.append(entry_info_dict)
 
-        if len(entries) == 1:
-            return merge_dicts(entries[0], info_dict)
-        else:
+        if len(entries) > 1:
             for num, e in enumerate(entries, start=1):
                 # 'url' results don't have a title
                 if e.get('title') is not None:
                     e['title'] = '%s (%d)' % (e['title'], num)
-            return {
-                '_type': 'playlist',
-                'entries': entries,
-            }
+        return entries