[extractor] Framework for embed detection (#4307)
[yt-dlp.git] / yt_dlp / extractor / generic.py
index 1fcb0a53bf4f963ff496917592edef39278a1d4b..d6a6166a0a717be40195a5996a1849c587e10013 100644 (file)
@@ -1,7 +1,10 @@
 import os
 import re
+import urllib.parse
 import xml.etree.ElementTree
 
+from . import gen_extractor_classes
+from .common import InfoExtractor  # isort: split
 from .ant1newsgr import Ant1NewsGrEmbedIE
 from .anvato import AnvatoIE
 from .apa import APAIE
@@ -13,7 +16,6 @@
 from .brightcove import BrightcoveLegacyIE, BrightcoveNewIE
 from .channel9 import Channel9IE
 from .cloudflarestream import CloudflareStreamIE
-from .common import InfoExtractor
 from .commonprotocols import RtmpIE
 from .condenast import CondeNastIE
 from .dailymail import DailyMailIE
@@ -69,6 +71,7 @@
 from .sportbox import SportBoxIE
 from .spotify import SpotifyBaseIE
 from .springboardplatform import SpringboardPlatformIE
+from .substack import SubstackIE
 from .svt import SVTIE
 from .teachable import TeachableIE
 from .ted import TedEmbedIE
 from .youporn import YouPornIE
 from .youtube import YoutubeIE
 from .zype import ZypeIE
-from ..compat import (
-    compat_etree_fromstring,
-    compat_str,
-    compat_urllib_parse_unquote,
-    compat_urlparse,
-)
+from ..compat import compat_etree_fromstring
 from ..utils import (
     KNOWN_EXTENSIONS,
     ExtractorError,
-    HEADRequest,
     UnsupportedError,
     determine_ext,
     dict_get,
     float_or_none,
+    format_field,
     int_or_none,
     is_html,
     js_to_json,
     orderedSet,
     parse_duration,
     parse_resolution,
-    sanitized_Request,
     smuggle_url,
     str_or_none,
     try_call,
@@ -145,7 +142,7 @@ class GenericIE(InfoExtractor):
     IE_DESC = 'Generic downloader that works on some sites'
     _VALID_URL = r'.*'
     IE_NAME = 'generic'
-    _NETRC_MACHINE = False  # Supress username warning
+    _NETRC_MACHINE = False  # Suppress username warning
     _TESTS = [
         # Direct link to a video
         {
@@ -2542,7 +2539,34 @@ class GenericIE(InfoExtractor):
                 'timestamp': 1652833414,
                 'age_limit': 0,
             }
-        }, {
+        },
+        {
+            'url': 'https://www.mollymovieclub.com/p/interstellar?s=r#details',
+            'md5': '198bde8bed23d0b23c70725c83c9b6d9',
+            'info_dict': {
+                'id': '53602801',
+                'ext': 'mpga',
+                'title': 'Interstellar',
+                'description': 'Listen now | Episode One',
+                'thumbnail': 'md5:c30d9c83f738e16d8551d7219d321538',
+                'uploader': 'Molly Movie Club',
+                'uploader_id': '839621',
+            },
+        },
+        {
+            'url': 'https://www.blockedandreported.org/p/episode-117-lets-talk-about-depp?s=r',
+            'md5': 'c0cc44ee7415daeed13c26e5b56d6aa0',
+            'info_dict': {
+                'id': '57962052',
+                'ext': 'mpga',
+                'title': 'md5:855b2756f0ee10f6723fa00b16266f8d',
+                'description': 'md5:fe512a5e94136ad260c80bde00ea4eef',
+                'thumbnail': 'md5:2218f27dfe517bb5ac16c47d0aebac59',
+                'uploader': 'Blocked and Reported',
+                'uploader_id': '500230',
+            },
+        },
+        {
             'url': 'https://www.skimag.com/video/ski-people-1980/',
             'info_dict': {
                 'id': 'ski-people-1980',
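
The new Substack test entries above use the 'md5:<hex>' form for fields such as title, description and thumbnail. In yt-dlp's test suite that prefix means the field is compared by the MD5 hex digest of the extracted value rather than verbatim, which keeps long expectations compact. A simplified sketch of that comparison rule (matches_expected is a hypothetical stand-in, not yt-dlp's own test helper):

    import hashlib

    def matches_expected(expected, got):
        # 'md5:<hex>' compares against the MD5 of the actual value;
        # any other string is compared literally
        if isinstance(expected, str) and expected.startswith('md5:'):
            return expected == 'md5:' + hashlib.md5(got.encode()).hexdigest()
        return expected == got

    assert matches_expected('Interstellar', 'Interstellar')
    assert matches_expected(
        'md5:' + hashlib.md5('Listen now | Episode One'.encode()).hexdigest(),
        'Listen now | Episode One')
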
@@ -2596,16 +2620,38 @@ class GenericIE(InfoExtractor):
                 'uploader': 'Mr Producer Media',
                 'upload_date': '20220610',
             }
-
-        }
+        },
+        {
+            'note': 'JSON LD with multiple @type',
+            'url': 'https://www.nu.nl/280161/video/hoe-een-bladvlo-dit-verwoestende-japanse-onkruid-moet-vernietigen.html',
+            'md5': 'c7949f34f57273013fb7ccb1156393db',
+            'info_dict': {
+                'id': 'ipy2AcGL',
+                'ext': 'mp4',
+                'description': 'md5:6a9d644bab0dc2dc06849c2505d8383d',
+                'thumbnail': r're:https://media\.nu\.nl/m/.+\.jpg',
+                'title': 'Hoe een bladvlo dit verwoestende Japanse onkruid moet vernietigen',
+                'timestamp': 1586577474,
+                'upload_date': '20200411',
+                'age_limit': 0,
+                'duration': 111.0,
+            }
+        },
     ]
 
     def report_following_redirect(self, new_url):
         """Report information extraction."""
         self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
 
-    def report_detected(self, name):
-        self._downloader.write_debug(f'Identified a {name}')
+    def report_detected(self, name, num=1, note=None):
+        if num > 1:
+            name += 's'
+        elif not num:
+            return
+        else:
+            num = 'a'
+
+        self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}')
 
     def _extract_rss(self, url, video_id, doc):
         NS_MAP = {
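
The reworked report_detected() scales its debug message with the number of hits: it pluralizes the name for several embeds, stays silent for zero, and says 'a' for exactly one, while format_field(note, None, '; %s') appends '; <note>' only when a note is passed. A standalone sketch of the resulting messages (describe_detection is a hypothetical name used only for illustration):

    def describe_detection(name, num=1, note=None):
        if num > 1:
            name += 's'      # e.g. 'Identified 3 YouTube embeds'
        elif not num:
            return None      # nothing found: no debug line at all
        else:
            num = 'a'        # e.g. 'Identified a Camtasia video'
        return f'Identified {num} {name}' + (f'; {note}' if note else '')

    assert describe_detection('Camtasia video') == 'Identified a Camtasia video'
    assert describe_detection('YouTube embed', 3) == 'Identified 3 YouTube embeds'
    assert describe_detection('foo embed', 0) is None
    assert (describe_detection('exclusive embed', 2, 'discarding other embeds')
            == 'Identified 2 exclusive embeds; discarding other embeds')
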
@@ -2660,7 +2706,7 @@ def _extract_camtasia(self, url, video_id, webpage):
 
         title = self._html_search_meta('DC.title', webpage, fatal=True)
 
-        camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
+        camtasia_url = urllib.parse.urljoin(url, camtasia_cfg)
         camtasia_cfg = self._download_xml(
             camtasia_url, video_id,
             note='Downloading camtasia configuration',
@@ -2676,7 +2722,7 @@ def _extract_camtasia(self, url, video_id, webpage):
             entries.append({
                 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
                 'title': f'{title} - {n.tag}',
-                'url': compat_urlparse.urljoin(url, url_n.text),
+                'url': urllib.parse.urljoin(url, url_n.text),
                 'duration': float_or_none(n.find('./duration').text),
             })
 
@@ -2728,7 +2774,7 @@ def _real_extract(self, url):
         if url.startswith('//'):
             return self.url_result(self.http_scheme() + url)
 
-        parsed_url = compat_urlparse.urlparse(url)
+        parsed_url = urllib.parse.urlparse(url)
         if not parsed_url.scheme:
             default_search = self.get_param('default_search')
             if default_search is None:
@@ -2768,43 +2814,34 @@ def _real_extract(self, url):
         else:
             video_id = self._generic_id(url)
 
-        self.to_screen('%s: Requesting header' % video_id)
-
-        head_req = HEADRequest(url)
-        head_response = self._request_webpage(
-            head_req, video_id,
-            note=False, errnote='Could not send HEAD request to %s' % url,
-            fatal=False)
-
-        if head_response is not False:
-            # Check for redirect
-            new_url = head_response.geturl()
-            if url != new_url:
-                self.report_following_redirect(new_url)
-                if force_videoid:
-                    new_url = smuggle_url(
-                        new_url, {'force_videoid': force_videoid})
-                return self.url_result(new_url)
-
-        full_response = None
-        if head_response is False:
-            request = sanitized_Request(url)
-            request.add_header('Accept-Encoding', '*')
-            full_response = self._request_webpage(request, video_id)
-            head_response = full_response
+        # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
+        # making it impossible to download only chunk of the file (yet we need only 512kB to
+        # test whether it's HTML or not). According to yt-dlp default Accept-Encoding
+        # that will always result in downloading the whole file that is not desirable.
+        # Therefore for extraction pass we have to override Accept-Encoding to any in order
+        # to accept raw bytes and being able to download only a chunk.
+        # It may probably better to solve this by checking Content-Type for application/octet-stream
+        # after a HEAD request, but not sure if we can rely on this.
+        full_response = self._request_webpage(url, video_id, headers={'Accept-Encoding': '*'})
+        new_url = full_response.geturl()
+        if url != new_url:
+            self.report_following_redirect(new_url)
+            if force_videoid:
+                new_url = smuggle_url(new_url, {'force_videoid': force_videoid})
+            return self.url_result(new_url)
 
         info_dict = {
             'id': video_id,
             'title': self._generic_title(url),
-            'timestamp': unified_timestamp(head_response.headers.get('Last-Modified'))
+            'timestamp': unified_timestamp(full_response.headers.get('Last-Modified'))
         }
 
         # Check for direct link to a video
-        content_type = head_response.headers.get('Content-Type', '').lower()
+        content_type = full_response.headers.get('Content-Type', '').lower()
         m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
         if m:
             self.report_detected('direct video link')
-            format_id = compat_str(m.group('format_id'))
+            format_id = str(m.group('format_id'))
             subtitles = {}
             if format_id.endswith('mpegurl'):
                 formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
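
The Content-Type check above decides whether the URL is a direct media link before any of the page body is read. A quick illustration of what the regex accepts, using the same pattern as in the hunk:

    import re

    DIRECT_LINK_RE = (r'^(?P<type>audio|video|application'
                      r'(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)')

    for ct in ('video/mp4', 'audio/mpeg; charset=utf-8', 'application/vnd.apple.mpegurl',
               'application/x-mpegurl', 'application/ogg', 'application/json', 'text/html'):
        m = re.match(DIRECT_LINK_RE, ct.lower())
        print(ct, '->', m.group('format_id') if m else 'not a direct link')

    # audio/* and video/* always match; application/* matches only via the
    # lookahead for Ogg and the M3U8 variants, whose format_id then ends in
    # 'mpegurl' and is routed to _extract_m3u8_formats_and_subtitles() below.
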
@@ -2826,21 +2863,7 @@ def _real_extract(self, url):
 
         if not self.get_param('test', False) and not is_intentional:
             force = self.get_param('force_generic_extractor', False)
-            self.report_warning(
-                '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
-
-        if not full_response:
-            request = sanitized_Request(url)
-            # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
-            # making it impossible to download only chunk of the file (yet we need only 512kB to
-            # test whether it's HTML or not). According to yt-dlp default Accept-Encoding
-            # that will always result in downloading the whole file that is not desirable.
-            # Therefore for extraction pass we have to override Accept-Encoding to any in order
-            # to accept raw bytes and being able to download only a chunk.
-            # It may probably better to solve this by checking Content-Type for application/octet-stream
-            # after HEAD request finishes, but not sure if we can rely on this.
-            request.add_header('Accept-Encoding', '*')
-            full_response = self._request_webpage(request, video_id)
+            self.report_warning('%s generic information extractor' % ('Forcing' if force else 'Falling back on'))
 
         first_bytes = full_response.read(512)
 
@@ -2918,12 +2941,28 @@ def _real_extract(self, url):
             self.report_detected('Camtasia video')
             return camtasia_res
 
+        info_dict.update({
+            # it's tempting to parse this further, but you would
+            # have to take into account all the variations like
+            #   Video Title - Site Name
+            #   Site Name | Video Title
+            #   Video Title - Tagline | Site Name
+            # and so on and so forth; it's just not practical
+            'title': (self._og_search_title(webpage, default=None)
+                      or self._html_extract_title(webpage, 'video title', default='video')),
+            'description': self._og_search_description(webpage, default=None),
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'age_limit': self._rta_search(webpage),
+        })
+
+        domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
+
         # Sometimes embedded video player is hidden behind percent encoding
         # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
         # Unescaping the whole page allows to handle those cases in a generic way
         # FIXME: unescaping the whole page may break URLs, commenting out for now.
         # There probably should be a second run of generic extractor on unescaped webpage.
-        # webpage = compat_urllib_parse_unquote(webpage)
+        # webpage = urllib.parse.unquote(webpage)
 
         # Unescape squarespace embeds to be detected by generic extractor,
         # see https://github.com/ytdl-org/youtube-dl/issues/21294
@@ -2931,40 +2970,12 @@ def _real_extract(self, url):
             r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>',
             lambda x: unescapeHTML(x.group(0)), webpage)
 
-        # it's tempting to parse this further, but you would
-        # have to take into account all the variations like
-        #   Video Title - Site Name
-        #   Site Name | Video Title
-        #   Video Title - Tagline | Site Name
-        # and so on and so forth; it's just not practical
-        video_title = (self._og_search_title(webpage, default=None)
-                       or self._html_extract_title(webpage, 'video title', default='video'))
-
-        # Try to detect age limit automatically
-        age_limit = self._rta_search(webpage)
-        # And then there are the jokers who advertise that they use RTA,
-        # but actually don't.
-        AGE_LIMIT_MARKERS = [
-            r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
-        ]
-        if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
-            age_limit = 18
-
-        # video uploader is domain name
-        video_uploader = self._search_regex(
-            r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
-
-        video_description = self._og_search_description(webpage, default=None)
-        video_thumbnail = self._og_search_thumbnail(webpage, default=None)
-
-        info_dict.update({
-            'title': video_title,
-            'description': video_description,
-            'thumbnail': video_thumbnail,
-            'age_limit': age_limit,
-        })
+        # TODO: Remove
+        video_title, video_description, video_thumbnail, age_limit, video_uploader = \
+            info_dict['title'], info_dict['description'], info_dict['thumbnail'], info_dict['age_limit'], domain_name
 
-        self._downloader.write_debug('Looking for video embeds')
+        # TODO: Move Embeds
+        self._downloader.write_debug('Looking for single embeds')
 
         # Look for Brightcove Legacy Studio embeds
         bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
@@ -2983,7 +2994,7 @@ def _real_extract(self, url):
             }
 
         # Look for Brightcove New Studio embeds
-        bc_urls = BrightcoveNewIE._extract_urls(self, webpage)
+        bc_urls = BrightcoveNewIE._extract_brightcove_urls(self, webpage)
         if bc_urls:
             return self.playlist_from_matches(
                 bc_urls, video_id, video_title,
@@ -3073,6 +3084,7 @@ def _real_extract(self, url):
         wistia_urls = WistiaIE._extract_urls(webpage)
         if wistia_urls:
             playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key())
+            playlist['entries'] = list(playlist['entries'])
             for entry in playlist['entries']:
                 entry.update({
                     '_type': 'url_transparent',
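
The added list() call matters because playlist_from_matches() can return the 'entries' value as a lazy generator: the loop below mutates each entry in place, and without materializing the generator first those mutations would be lost, since a generator can only be consumed once. A minimal illustration of the failure mode, independent of yt-dlp:

    def make_entries():  # stands in for a lazy 'entries' generator
        yield {'_type': 'url', 'url': 'https://example.com/a'}

    playlist = {'entries': make_entries()}        # generator, as before the change
    for entry in playlist['entries']:
        entry['_type'] = 'url_transparent'        # mutation happens here...
    print(list(playlist['entries']))              # ...but prints []: already exhausted

    playlist = {'entries': list(make_entries())}  # materialized, as in the diff
    for entry in playlist['entries']:
        entry['_type'] = 'url_transparent'
    print(playlist['entries'])                    # the updates persist for the caller
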
@@ -3092,6 +3104,11 @@ def _real_extract(self, url):
             # Don't set the extractor because it can be a track url or an album
             return self.url_result(burl)
 
+        # Check for Substack custom domains
+        substack_url = SubstackIE._extract_url(webpage, url)
+        if substack_url:
+            return self.url_result(substack_url, SubstackIE)
+
         # Look for embedded Vevo player
         mobj = re.search(
             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
@@ -3190,7 +3207,7 @@ def _real_extract(self, url):
             return self.url_result(mobj.group('url'))
         mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
         if mobj is not None:
-            return self.url_result(compat_urllib_parse_unquote(mobj.group('url')))
+            return self.url_result(urllib.parse.unquote(mobj.group('url')))
 
         # Look for funnyordie embed
         matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
@@ -3225,7 +3242,7 @@ def _real_extract(self, url):
             return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key())
 
         # Look for embedded Spotify player
-        spotify_urls = SpotifyBaseIE._extract_embed_urls(webpage)
+        spotify_urls = SpotifyBaseIE._extract_urls(webpage)
         if spotify_urls:
             return self.playlist_from_matches(spotify_urls, video_id, video_title)
 
@@ -3443,7 +3460,7 @@ def _real_extract(self, url):
             r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage)
         if mobj is not None:
             return self.url_result(
             r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage)
         if mobj is not None:
             return self.url_result(
-                compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
+                urllib.parse.urljoin(url, mobj.group('url')), 'UDNEmbed')
 
         # Look for Senate ISVP iframe
         senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
@@ -3676,7 +3693,7 @@ def _real_extract(self, url):
         if mediasite_urls:
             entries = [
                 self.url_result(smuggle_url(
-                    compat_urlparse.urljoin(url, mediasite_url),
+                    urllib.parse.urljoin(url, mediasite_url),
                     {'UrlReferrer': url}), ie=MediasiteIE.ie_key())
                 for mediasite_url in mediasite_urls]
             return self.playlist_result(entries, video_id, video_title)
@@ -3816,6 +3833,30 @@ def _real_extract(self, url):
         tiktok_urls = TikTokIE._extract_urls(webpage)
         if tiktok_urls:
             return self.playlist_from_matches(tiktok_urls, video_id, video_title)
+        # TODO: END: Move Embeds
+
+        self._downloader.write_debug('Looking for embeds')
+        embeds = []
+        for ie in gen_extractor_classes():
+            gen = ie.extract_from_webpage(self._downloader, url, webpage)
+            current_embeds = []
+            try:
+                while True:
+                    current_embeds.append(next(gen))
+            except self.StopExtraction:
+                self.report_detected(f'{ie.IE_NAME} exclusive embed', len(current_embeds),
+                                     embeds and 'discarding other embeds')
+                embeds = current_embeds
+                break
+            except StopIteration:
+                self.report_detected(f'{ie.IE_NAME} embed', len(current_embeds))
+                embeds.extend(current_embeds)
+
+        del current_embeds
+        if len(embeds) == 1:
+            return {**info_dict, **embeds[0]}
+        elif embeds:
+            return self.playlist_result(embeds, **info_dict)
 
         # Look for HTML5 media
         entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
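
This loop is the heart of the new framework: every registered extractor gets a chance to yield embeds from the page through extract_from_webpage(), and raising StopExtraction mid-generation claims the page exclusively for that extractor, discarding embeds gathered from the others. A toy model of the control flow (the stand-in exception and collect_embeds are illustrative only; the real extract_from_webpage and StopExtraction live in the extractor framework):

    class StopExtraction(Exception):
        """Stand-in for the framework's exclusive-claim signal."""

    def collect_embeds(extractors, url, webpage):
        embeds = []
        for ie in extractors:
            gen = ie.extract_from_webpage(url, webpage)  # generator of info dicts
            current = []
            try:
                while True:
                    current.append(next(gen))
            except StopExtraction:
                return current          # exclusive claim: keep only these embeds
            except StopIteration:
                embeds.extend(current)  # finished normally: accumulate, keep looking
        return embeds

As the added lines above show, a single collected embed is merged into info_dict directly, while several become a playlist result.
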
@@ -3871,11 +3912,11 @@ def _real_extract(self, url):
             subtitles = {}
             for source in sources:
                 src = source.get('src')
-                if not src or not isinstance(src, compat_str):
+                if not src or not isinstance(src, str):
                     continue
-                src = compat_urlparse.urljoin(url, src)
+                src = urllib.parse.urljoin(url, src)
                 src_type = source.get('type')
-                if isinstance(src_type, compat_str):
+                if isinstance(src_type, str):
                     src_type = src_type.lower()
                 ext = determine_ext(src).lower()
                 if src_type == 'video/youtube':
@@ -3909,7 +3950,7 @@ def _real_extract(self, url):
                 if not src:
                     continue
                 subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({
-                    'url': compat_urlparse.urljoin(url, src),
+                    'url': urllib.parse.urljoin(url, src),
                     'name': sub.get('label'),
                     'http_headers': {
                         'Referer': full_response.geturl(),
@@ -3936,7 +3977,7 @@ def check_video(vurl):
                 return True
             if RtmpIE.suitable(vurl):
                 return True
-            vpath = compat_urlparse.urlparse(vurl).path
+            vpath = urllib.parse.urlparse(vurl).path
             vext = determine_ext(vpath, None)
             return vext not in (None, 'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml')
 
@@ -4060,11 +4101,11 @@ def filter_video(urls):
                 webpage)
             if not found:
                 # Look also in Refresh HTTP header
-                refresh_header = head_response.headers.get('Refresh')
+                refresh_header = full_response.headers.get('Refresh')
                 if refresh_header:
                     found = re.search(REDIRECT_REGEX, refresh_header)
             if found:
-                new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1)))
+                new_url = urllib.parse.urljoin(url, unescapeHTML(found.group(1)))
                 if new_url != url:
                     self.report_following_redirect(new_url)
                     return {
@@ -4090,15 +4131,14 @@ def filter_video(urls):
         for video_url in orderedSet(found):
             video_url = unescapeHTML(video_url)
             video_url = video_url.replace('\\/', '/')
-            video_url = compat_urlparse.urljoin(url, video_url)
-            video_id = compat_urllib_parse_unquote(os.path.basename(video_url))
+            video_url = urllib.parse.urljoin(url, video_url)
+            video_id = urllib.parse.unquote(os.path.basename(video_url))
 
             # Sometimes, jwplayer extraction will result in a YouTube URL
             if YoutubeIE.suitable(video_url):
                 entries.append(self.url_result(video_url, 'Youtube'))
                 continue
 
-            # here's a fun little line of code for you:
             video_id = os.path.splitext(video_id)[0]
             headers = {
                 'referer': full_response.geturl()