[cleanup] Misc cleanup
yt_dlp/extractor/generic.py
index 0d0e002e50cb929e18b7d8f08698a1eb5e802252..b63271c1fb7c1787383c04a2050ea1d42b0e5723 100644
@@ -1,5 +1,6 @@
 import os
 import re
+import urllib.parse
 import xml.etree.ElementTree
 
 from .ant1newsgr import Ant1NewsGrEmbedIE
 from .sportbox import SportBoxIE
 from .spotify import SpotifyBaseIE
 from .springboardplatform import SpringboardPlatformIE
+from .substack import SubstackIE
 from .svt import SVTIE
 from .teachable import TeachableIE
 from .ted import TedEmbedIE
 from .theplatform import ThePlatformIE
 from .threeqsdn import ThreeQSDNIE
+from .tiktok import TikTokIE
 from .tnaflix import TNAFlixNetworkEmbedIE
 from .tube8 import Tube8IE
 from .tunein import TuneInBaseIE
 from .youporn import YouPornIE
 from .youtube import YoutubeIE
 from .zype import ZypeIE
-from ..compat import (
-    compat_etree_fromstring,
-    compat_str,
-    compat_urllib_parse_unquote,
-    compat_urlparse,
-)
+from ..compat import compat_etree_fromstring
 from ..utils import (
     KNOWN_EXTENSIONS,
     ExtractorError,
     sanitized_Request,
     smuggle_url,
     str_or_none,
+    try_call,
     unescapeHTML,
     unified_timestamp,
     unsmuggle_url,
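
This hunk is one instance of the compat-shim removal that runs through the whole diff; on Python 3 each of the dropped aliases maps one-to-one onto a stdlib name. A quick reference sketch (standard library only, nothing yt-dlp-specific assumed):

    import urllib.parse

    # compat_urllib_parse_unquote  ->  urllib.parse.unquote
    print(urllib.parse.unquote('Paris%20d-moll'))               # 'Paris d-moll'
    # compat_urlparse.urljoin      ->  urllib.parse.urljoin
    print(urllib.parse.urljoin('https://a.example/x/', 'y'))    # 'https://a.example/x/y'
    # compat_urlparse.urlparse     ->  urllib.parse.urlparse
    print(urllib.parse.urlparse('https://a.example/x').scheme)  # 'https'
    # compat_str                   ->  str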
@@ -143,7 +142,7 @@ class GenericIE(InfoExtractor):
     IE_DESC = 'Generic downloader that works on some sites'
     _VALID_URL = r'.*'
     IE_NAME = 'generic'
-    _NETRC_MACHINE = False  # Supress username warning
+    _NETRC_MACHINE = False  # Suppress username warning
     _TESTS = [
         # Direct link to a video
         {
@@ -2526,6 +2525,118 @@ class GenericIE(InfoExtractor):
                 'upload_date': '20220504',
             },
         },
+        {
+            # Webpage contains double BOM
+            'url': 'https://www.filmarkivet.se/movies/paris-d-moll/',
+            'md5': 'df02cadc719dcc63d43288366f037754',
+            'info_dict': {
+                'id': 'paris-d-moll',
+                'ext': 'mp4',
+                'upload_date': '20220518',
+                'title': 'Paris d-moll',
+                'description': 'md5:319e37ea5542293db37e1e13072fe330',
+                'thumbnail': 'https://www.filmarkivet.se/wp-content/uploads/parisdmoll2.jpg',
+                'timestamp': 1652833414,
+                'age_limit': 0,
+            }
+        },
+        {
+            'url': 'https://www.mollymovieclub.com/p/interstellar?s=r#details',
+            'md5': '198bde8bed23d0b23c70725c83c9b6d9',
+            'info_dict': {
+                'id': '53602801',
+                'ext': 'mpga',
+                'title': 'Interstellar',
+                'description': 'Listen now | Episode One',
+                'thumbnail': 'md5:c30d9c83f738e16d8551d7219d321538',
+                'uploader': 'Molly Movie Club',
+                'uploader_id': '839621',
+            },
+        },
+        {
+            'url': 'https://www.blockedandreported.org/p/episode-117-lets-talk-about-depp?s=r',
+            'md5': 'c0cc44ee7415daeed13c26e5b56d6aa0',
+            'info_dict': {
+                'id': '57962052',
+                'ext': 'mpga',
+                'title': 'md5:855b2756f0ee10f6723fa00b16266f8d',
+                'description': 'md5:fe512a5e94136ad260c80bde00ea4eef',
+                'thumbnail': 'md5:2218f27dfe517bb5ac16c47d0aebac59',
+                'uploader': 'Blocked and Reported',
+                'uploader_id': '500230',
+            },
+        },
+        {
+            'url': 'https://www.skimag.com/video/ski-people-1980/',
+            'info_dict': {
+                'id': 'ski-people-1980',
+                'title': 'Ski People (1980)',
+            },
+            'playlist_count': 1,
+            'playlist': [{
+                'md5': '022a7e31c70620ebec18deeab376ee03',
+                'info_dict': {
+                    'id': 'YTmgRiNU',
+                    'ext': 'mp4',
+                    'title': '1980 Ski People',
+                    'timestamp': 1610407738,
+                    'description': 'md5:cf9c3d101452c91e141f292b19fe4843',
+                    'thumbnail': 'https://cdn.jwplayer.com/v2/media/YTmgRiNU/poster.jpg?width=720',
+                    'duration': 5688.0,
+                    'upload_date': '20210111',
+                }
+            }]
+        },
+        {
+            'note': 'Rumble embed',
+            'url': 'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html',
+            'md5': '53af34098a7f92c4e51cf0bd1c33f009',
+            'info_dict': {
+                'id': 'vb0ofn',
+                'ext': 'mp4',
+                'timestamp': 1612662578,
+                'uploader': 'LovingMontana',
+                'channel': 'LovingMontana',
+                'upload_date': '20210207',
+                'title': 'Winter-loving dog helps girls dig a snow fort ',
+                'channel_url': 'https://rumble.com/c/c-546523',
+                'thumbnail': 'https://sp.rmbl.ws/s8/1/5/f/x/x/5fxxb.OvCc.1-small-Moose-The-Dog-Helps-Girls-D.jpg',
+                'duration': 103,
+            }
+        },
+        {
+            'note': 'Rumble JS embed',
+            'url': 'https://therightscoop.com/what-does-9-plus-1-plus-1-equal-listen-to-this-audio-of-attempted-kavanaugh-assassins-call-and-youll-get-it',
+            'md5': '4701209ac99095592e73dbba21889690',
+            'info_dict': {
+                'id': 'v15eqxl',
+                'ext': 'mp4',
+                'channel': 'Mr Producer Media',
+                'duration': 92,
+                'title': '911 Audio From The Man Who Wanted To Kill Supreme Court Justice Kavanaugh',
+                'channel_url': 'https://rumble.com/c/RichSementa',
+                'thumbnail': 'https://sp.rmbl.ws/s8/1/P/j/f/A/PjfAe.OvCc-small-911-Audio-From-The-Man-Who-.jpg',
+                'timestamp': 1654892716,
+                'uploader': 'Mr Producer Media',
+                'upload_date': '20220610',
+            }
+        },
+        {
+            'note': 'JSON LD with multiple @type',
+            'url': 'https://www.nu.nl/280161/video/hoe-een-bladvlo-dit-verwoestende-japanse-onkruid-moet-vernietigen.html',
+            'md5': 'c7949f34f57273013fb7ccb1156393db',
+            'info_dict': {
+                'id': 'ipy2AcGL',
+                'ext': 'mp4',
+                'description': 'md5:6a9d644bab0dc2dc06849c2505d8383d',
+                'thumbnail': r're:https://media\.nu\.nl/m/.+\.jpg',
+                'title': 'Hoe een bladvlo dit verwoestende Japanse onkruid moet vernietigen',
+                'timestamp': 1586577474,
+                'upload_date': '20200411',
+                'age_limit': 0,
+                'duration': 111.0,
+            }
+        },
     ]
 
     def report_following_redirect(self, new_url):
@@ -2536,66 +2647,44 @@ def report_detected(self, name):
         self._downloader.write_debug(f'Identified a {name}')
 
     def _extract_rss(self, url, video_id, doc):
-        playlist_title = doc.find('./channel/title').text
-        playlist_desc_el = doc.find('./channel/description')
-        playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
-
         NS_MAP = {
             'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
         }
 
         entries = []
         for it in doc.findall('./channel/item'):
-            next_url = None
-            enclosure_nodes = it.findall('./enclosure')
-            for e in enclosure_nodes:
-                next_url = e.attrib.get('url')
-                if next_url:
-                    break
-
-            if not next_url:
-                next_url = xpath_text(it, 'link', fatal=False)
-
+            next_url = next(
+                (e.attrib.get('url') for e in it.findall('./enclosure')),
+                xpath_text(it, 'link', fatal=False))
             if not next_url:
                 continue
 
-            if it.find('guid').text is not None:
-                next_url = smuggle_url(next_url, {'force_videoid': it.find('guid').text})
+            guid = try_call(lambda: it.find('guid').text)
+            if guid:
+                next_url = smuggle_url(next_url, {'force_videoid': guid})
 
             def itunes(key):
-                return xpath_text(
-                    it, xpath_with_ns('./itunes:%s' % key, NS_MAP),
-                    default=None)
-
-            duration = itunes('duration')
-            explicit = (itunes('explicit') or '').lower()
-            if explicit in ('true', 'yes'):
-                age_limit = 18
-            elif explicit in ('false', 'no'):
-                age_limit = 0
-            else:
-                age_limit = None
+                return xpath_text(it, xpath_with_ns(f'./itunes:{key}', NS_MAP), default=None)
 
             entries.append({
                 '_type': 'url_transparent',
                 'url': next_url,
-                'title': it.find('title').text,
+                'title': try_call(lambda: it.find('title').text),
                 'description': xpath_text(it, 'description', default=None),
-                'timestamp': unified_timestamp(
-                    xpath_text(it, 'pubDate', default=None)),
-                'duration': int_or_none(duration) or parse_duration(duration),
+                'timestamp': unified_timestamp(xpath_text(it, 'pubDate', default=None)),
+                'duration': parse_duration(itunes('duration')),
                 'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')),
                 'episode': itunes('title'),
                 'episode_number': int_or_none(itunes('episode')),
                 'season_number': int_or_none(itunes('season')),
-                'age_limit': age_limit,
+                'age_limit': {'true': 18, 'yes': 18, 'false': 0, 'no': 0}.get((itunes('explicit') or '').lower()),
             })
 
         return {
             '_type': 'playlist',
             'id': url,
-            'title': playlist_title,
-            'description': playlist_desc,
+            'title': try_call(lambda: doc.find('./channel/title').text),
+            'description': try_call(lambda: doc.find('./channel/description').text),
             'entries': entries,
         }
 
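The rewritten _extract_rss leans on two compact idioms. try_call(f) returns f()'s value but swallows lookup errors, so a feed item without <guid> or <title> yields None instead of raising AttributeError; and a dict .get() maps itunes:explicit straight onto an age limit, with unknown values falling through to None. A minimal sketch, using a simplified stand-in for the real yt_dlp.utils.try_call (which also supports args/kwargs and an expected_type filter):

    import xml.etree.ElementTree as ET

    def try_call(*funcs):
        # simplified stand-in for yt_dlp.utils.try_call
        for f in funcs:
            try:
                return f()
            except (AttributeError, KeyError, TypeError, IndexError):
                pass

    item = ET.fromstring('<item><title>Ep 1</title></item>')
    print(try_call(lambda: item.find('guid').text))   # None: <guid> is absent, no exception
    print(try_call(lambda: item.find('title').text))  # 'Ep 1'

    for explicit in ('Yes', 'false', ''):
        print({'true': 18, 'yes': 18, 'false': 0, 'no': 0}.get(explicit.lower()))
    # 18, then 0, then None
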
@@ -2610,7 +2699,7 @@ def _extract_camtasia(self, url, video_id, webpage):
 
         title = self._html_search_meta('DC.title', webpage, fatal=True)
 
-        camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
+        camtasia_url = urllib.parse.urljoin(url, camtasia_cfg)
         camtasia_cfg = self._download_xml(
             camtasia_url, video_id,
             note='Downloading camtasia configuration',
@@ -2626,7 +2715,7 @@ def _extract_camtasia(self, url, video_id, webpage):
             entries.append({
                 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
                 'title': f'{title} - {n.tag}',
-                'url': compat_urlparse.urljoin(url, url_n.text),
+                'url': urllib.parse.urljoin(url, url_n.text),
                 'duration': float_or_none(n.find('./duration').text),
             })
 
@@ -2678,7 +2767,7 @@ def _real_extract(self, url):
         if url.startswith('//'):
             return self.url_result(self.http_scheme() + url)
 
-        parsed_url = compat_urlparse.urlparse(url)
+        parsed_url = urllib.parse.urlparse(url)
         if not parsed_url.scheme:
             default_search = self.get_param('default_search')
             if default_search is None:
@@ -2736,12 +2825,22 @@ def _real_extract(self, url):
                         new_url, {'force_videoid': force_videoid})
                 return self.url_result(new_url)
 
-        full_response = None
-        if head_response is False:
+        def request_webpage():
             request = sanitized_Request(url)
+            # Some webservers may serve compressed content of considerable size (e.g. gzipped flac),
+            # making it impossible to download only a chunk of the file (yet we need only 512kB to
+            # test whether it's HTML or not). yt-dlp's default Accept-Encoding would
+            # always result in downloading the whole file, which is not desirable.
+            # Therefore, for the extraction pass, we override Accept-Encoding to any ('*')
+            # so that raw bytes are accepted and only a chunk needs to be downloaded.
+            # It may be better to solve this by checking Content-Type for application/octet-stream
+            # after the HEAD request finishes, but it is unclear whether we can rely on that.
             request.add_header('Accept-Encoding', '*')
-            full_response = self._request_webpage(request, video_id)
-            head_response = full_response
+            return self._request_webpage(request, video_id)
+
+        full_response = None
+        if head_response is False:
+            head_response = full_response = request_webpage()
 
         info_dict = {
             'id': video_id,
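
The comment block above carries the rationale: with the default Accept-Encoding the server may pick a compressed transfer encoding, and decoding the stream then effectively forces fetching the whole body, while the extractor only needs the first 512 bytes to sniff for HTML. A standalone sketch of the same idea using plain urllib (https://example.com/ is a placeholder URL):

    import urllib.request

    req = urllib.request.Request('https://example.com/')
    req.add_header('Accept-Encoding', '*')  # accept anything, including identity/raw bytes
    with urllib.request.urlopen(req) as resp:
        first_bytes = resp.read(512)        # read only enough to test for an HTML document
    print(first_bytes[:64])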
@@ -2754,7 +2853,7 @@ def _real_extract(self, url):
         m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
         if m:
             self.report_detected('direct video link')
-            format_id = compat_str(m.group('format_id'))
+            format_id = str(m.group('format_id'))
             subtitles = {}
             if format_id.endswith('mpegurl'):
                 formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
@@ -2779,19 +2878,7 @@ def _real_extract(self, url):
             self.report_warning(
                 '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
 
-        if not full_response:
-            request = sanitized_Request(url)
-            # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
-            # making it impossible to download only chunk of the file (yet we need only 512kB to
-            # test whether it's HTML or not). According to yt-dlp default Accept-Encoding
-            # that will always result in downloading the whole file that is not desirable.
-            # Therefore for extraction pass we have to override Accept-Encoding to any in order
-            # to accept raw bytes and being able to download only a chunk.
-            # It may probably better to solve this by checking Content-Type for application/octet-stream
-            # after HEAD request finishes, but not sure if we can rely on this.
-            request.add_header('Accept-Encoding', '*')
-            full_response = self._request_webpage(request, video_id)
-
+        full_response = full_response or request_webpage()
         first_bytes = full_response.read(512)
 
         # Is it an M3U playlist?
@@ -2873,7 +2960,7 @@ def _real_extract(self, url):
         # Unescaping the whole page allows to handle those cases in a generic way
         # FIXME: unescaping the whole page may break URLs, commenting out for now.
         # There probably should be a second run of generic extractor on unescaped webpage.
-        # webpage = compat_urllib_parse_unquote(webpage)
+        # webpage = urllib.parse.unquote(webpage)
 
         # Unescape squarespace embeds to be detected by generic extractor,
         # see https://github.com/ytdl-org/youtube-dl/issues/21294
@@ -2975,7 +3062,7 @@ def _real_extract(self, url):
         if vimeo_urls:
             return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key())
 
-        vhx_url = VHXEmbedIE._extract_url(webpage)
+        vhx_url = VHXEmbedIE._extract_url(url, webpage)
         if vhx_url:
             return self.url_result(vhx_url, VHXEmbedIE.ie_key())
 
@@ -3023,6 +3110,7 @@ def _real_extract(self, url):
         wistia_urls = WistiaIE._extract_urls(webpage)
         if wistia_urls:
             playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key())
+            playlist['entries'] = list(playlist['entries'])
             for entry in playlist['entries']:
                 entry.update({
                     '_type': 'url_transparent',
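
The added list(...) call matters because playlist_from_matches can hand back a lazy generator of entries: the for-loop below would consume it, and the per-entry update() calls would be lost by the time the playlist is actually processed. A minimal illustration of the failure mode and the fix:

    def entries():
        yield {'id': 'a'}

    playlist = {'entries': entries()}
    for entry in playlist['entries']:
        entry['_type'] = 'url_transparent'
    print(list(playlist['entries']))   # [] -- generator already exhausted, updates lost

    playlist = {'entries': list(entries())}  # materialize first
    for entry in playlist['entries']:
        entry['_type'] = 'url_transparent'
    print(playlist['entries'])         # [{'id': 'a', '_type': 'url_transparent'}]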
@@ -3042,6 +3130,11 @@ def _real_extract(self, url):
             # Don't set the extractor because it can be a track url or an album
             return self.url_result(burl)
 
+        # Check for Substack custom domains
+        substack_url = SubstackIE._extract_url(webpage, url)
+        if substack_url:
+            return self.url_result(substack_url, SubstackIE)
+
         # Look for embedded Vevo player
         mobj = re.search(
             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
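
The Substack check added in this hunk, like the TikTok check in a later hunk, follows the usual embed-detection convention: the embedded site's extractor exposes a class-level helper that regex-scans the host webpage and hands back canonical URL(s) for url_result or playlist_from_matches. Neither helper's body appears in this diff; the following is only a hypothetical sketch of the shape such helpers typically take (ExampleEmbedIE and its pattern are illustrative, not real yt-dlp code):

    import re

    class ExampleEmbedIE:
        _EMBED_RE = r'<iframe[^>]+src=["\'](?P<url>https?://(?:www\.)?example\.com/embed/[^"\']+)'

        @classmethod
        def _extract_urls(cls, webpage):
            # return every embed URL found on the host page
            return [m.group('url') for m in re.finditer(cls._EMBED_RE, webpage)]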
@@ -3140,7 +3233,7 @@ def _real_extract(self, url):
             return self.url_result(mobj.group('url'))
         mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
         if mobj is not None:
-            return self.url_result(compat_urllib_parse_unquote(mobj.group('url')))
+            return self.url_result(urllib.parse.unquote(mobj.group('url')))
 
         # Look for funnyordie embed
         matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
@@ -3393,7 +3486,7 @@ def _real_extract(self, url):
             r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage)
         if mobj is not None:
             return self.url_result(
-                compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
+                urllib.parse.urljoin(url, mobj.group('url')), 'UDNEmbed')
 
         # Look for Senate ISVP iframe
         senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
@@ -3626,7 +3719,7 @@ def _real_extract(self, url):
         if mediasite_urls:
             entries = [
                 self.url_result(smuggle_url(
-                    compat_urlparse.urljoin(url, mediasite_url),
+                    urllib.parse.urljoin(url, mediasite_url),
                     {'UrlReferrer': url}), ie=MediasiteIE.ie_key())
                 for mediasite_url in mediasite_urls]
             return self.playlist_result(entries, video_id, video_title)
@@ -3762,6 +3855,11 @@ def _real_extract(self, url):
         if ruutu_urls:
             return self.playlist_from_matches(ruutu_urls, video_id, video_title)
 
+        # Look for Tiktok embeds
+        tiktok_urls = TikTokIE._extract_urls(webpage)
+        if tiktok_urls:
+            return self.playlist_from_matches(tiktok_urls, video_id, video_title)
+
         # Look for HTML5 media
         entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
         if entries:
@@ -3816,11 +3914,11 @@ def _real_extract(self, url):
             subtitles = {}
             for source in sources:
                 src = source.get('src')
-                if not src or not isinstance(src, compat_str):
+                if not src or not isinstance(src, str):
                     continue
-                src = compat_urlparse.urljoin(url, src)
+                src = urllib.parse.urljoin(url, src)
                 src_type = source.get('type')
-                if isinstance(src_type, compat_str):
+                if isinstance(src_type, str):
                     src_type = src_type.lower()
                 ext = determine_ext(src).lower()
                 if src_type == 'video/youtube':
@@ -3854,7 +3952,7 @@ def _real_extract(self, url):
                 if not src:
                     continue
                 subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({
-                    'url': compat_urlparse.urljoin(url, src),
+                    'url': urllib.parse.urljoin(url, src),
                     'name': sub.get('label'),
                     'http_headers': {
                         'Referer': full_response.geturl(),
@@ -3871,22 +3969,17 @@ def _real_extract(self, url):
         json_ld = self._search_json_ld(webpage, video_id, default={})
         if json_ld.get('url') not in (url, None):
             self.report_detected('JSON LD')
-            if determine_ext(json_ld['url']) == 'm3u8':
-                json_ld['formats'], json_ld['subtitles'] = self._extract_m3u8_formats_and_subtitles(
-                    json_ld['url'], video_id, 'mp4')
-                json_ld.pop('url')
-                self._sort_formats(json_ld['formats'])
-            else:
-                json_ld['_type'] = 'url_transparent'
-                json_ld['url'] = smuggle_url(json_ld['url'], {'force_videoid': video_id, 'to_generic': True})
-            return merge_dicts(json_ld, info_dict)
+            return merge_dicts({
+                '_type': 'url_transparent',
+                'url': smuggle_url(json_ld['url'], {'force_videoid': video_id, 'to_generic': True}),
+            }, json_ld, info_dict)
 
         def check_video(vurl):
             if YoutubeIE.suitable(vurl):
                 return True
             if RtmpIE.suitable(vurl):
                 return True
-            vpath = compat_urlparse.urlparse(vurl).path
+            vpath = urllib.parse.urlparse(vurl).path
             vext = determine_ext(vpath, None)
             return vext not in (None, 'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml')
 
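This rewrite relies on merge_dicts resolving keys left to right: the _type/smuggled url pair wins, then the parsed JSON-LD fields, then the fallback info_dict, which is why the m3u8 special case can be dropped here and left to whatever downstream extractor receives the transparent URL. Roughly how the utility behaves (a simplified sketch of yt_dlp.utils.merge_dicts, not the verbatim implementation):

    def merge_dicts(*dicts):
        # earlier dicts take priority; later ones only fill gaps (or replace '' strings)
        merged = {}
        for a_dict in dicts:
            for k, v in a_dict.items():
                if v is None:
                    continue
                if k not in merged or (isinstance(v, str) and merged[k] == ''):
                    merged[k] = v
        return merged

    print(merge_dicts(
        {'_type': 'url_transparent', 'url': 'smuggled://x'},
        {'url': 'https://original.example/', 'title': 'T'},
        {'id': 'video_id', 'title': ''}))
    # {'_type': 'url_transparent', 'url': 'smuggled://x', 'title': 'T', 'id': 'video_id'}
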
@@ -4014,7 +4107,7 @@ def filter_video(urls):
                 if refresh_header:
                     found = re.search(REDIRECT_REGEX, refresh_header)
             if found:
-                new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1)))
+                new_url = urllib.parse.urljoin(url, unescapeHTML(found.group(1)))
                 if new_url != url:
                     self.report_following_redirect(new_url)
                     return {
@@ -4040,8 +4133,8 @@ def filter_video(urls):
         for video_url in orderedSet(found):
             video_url = unescapeHTML(video_url)
             video_url = video_url.replace('\\/', '/')
-            video_url = compat_urlparse.urljoin(url, video_url)
-            video_id = compat_urllib_parse_unquote(os.path.basename(video_url))
+            video_url = urllib.parse.urljoin(url, video_url)
+            video_id = urllib.parse.unquote(os.path.basename(video_url))
 
             # Sometimes, jwplayer extraction will result in a YouTube URL
             if YoutubeIE.suitable(video_url):
@@ -4106,7 +4199,7 @@ def filter_video(urls):
             entries.append(entry_info_dict)
 
         if len(entries) == 1:
-            return entries[0]
+            return merge_dicts(entries[0], info_dict)
         else:
             for num, e in enumerate(entries, start=1):
                 # 'url' results don't have a title