[cleanup] Minor fixes (See desc)

[yt-dlp.git] / yt_dlp / extractor / generic.py
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py

index 65e803dd7061330030e4f7c3f8e0a9a17ea1ad5f..0d0e002e50cb929e18b7d8f08698a1eb5e802252 100644 (file)
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -1,30 +1,126 @@
-# coding: utf-8
-
-from __future__ import unicode_literals
-
  import os
  import re
-import sys
+import xml.etree.ElementTree
  
+from .ant1newsgr import Ant1NewsGrEmbedIE
+from .anvato import AnvatoIE
+from .apa import APAIE
+from .arcpublishing import ArcPublishingIE
+from .arkena import ArkenaIE
+from .arte import ArteTVEmbedIE
+from .bitchute import BitChuteIE
+from .blogger import BloggerIE
+from .brightcove import BrightcoveLegacyIE, BrightcoveNewIE
+from .channel9 import Channel9IE
+from .cloudflarestream import CloudflareStreamIE
  from .common import InfoExtractor
+from .commonprotocols import RtmpIE
+from .condenast import CondeNastIE
+from .dailymail import DailyMailIE
+from .dailymotion import DailymotionIE
+from .dbtv import DBTVIE
+from .digiteka import DigitekaIE
+from .drtuber import DrTuberIE
+from .eagleplatform import EaglePlatformIE
+from .ertgr import ERTWebtvEmbedIE
+from .expressen import ExpressenIE
+from .facebook import FacebookIE
+from .foxnews import FoxNewsIE
+from .gedidigital import GediDigitalIE
+from .gfycat import GfycatIE
+from .glomex import GlomexEmbedIE
+from .googledrive import GoogleDriveIE
+from .indavideo import IndavideoEmbedIE
+from .instagram import InstagramIE
+from .joj import JojIE
+from .jwplatform import JWPlatformIE
+from .kaltura import KalturaIE
+from .kinja import KinjaEmbedIE
+from .limelight import LimelightBaseIE
+from .mainstreaming import MainStreamingIE
+from .medialaan import MedialaanIE
+from .mediaset import MediasetIE
+from .mediasite import MediasiteIE
+from .megaphone import MegaphoneIE
+from .megatvcom import MegaTVComEmbedIE
+from .mofosex import MofosexEmbedIE
+from .mtv import MTVServicesEmbeddedIE
+from .myvi import MyviIE
+from .nbc import NBCSportsVPlayerIE
+from .nexx import NexxEmbedIE, NexxIE
+from .odnoklassniki import OdnoklassnikiIE
+from .onionstudios import OnionStudiosIE
+from .ooyala import OoyalaIE
+from .panopto import PanoptoBaseIE
+from .peertube import PeerTubeIE
+from .piksel import PikselIE
+from .pladform import PladformIE
+from .pornhub import PornHubIE
+from .rcs import RCSEmbedsIE
+from .redtube import RedTubeIE
+from .rumble import RumbleEmbedIE
+from .rutube import RutubeIE
+from .rutv import RUTVIE
+from .ruutu import RuutuIE
+from .senategov import SenateISVPIE
+from .simplecast import SimplecastIE
+from .soundcloud import SoundcloudEmbedIE
+from .spankwire import SpankwireIE
+from .sportbox import SportBoxIE
+from .spotify import SpotifyBaseIE
+from .springboardplatform import SpringboardPlatformIE
+from .svt import SVTIE
+from .teachable import TeachableIE
+from .ted import TedEmbedIE
+from .theplatform import ThePlatformIE
+from .threeqsdn import ThreeQSDNIE
+from .tnaflix import TNAFlixNetworkEmbedIE
+from .tube8 import Tube8IE
+from .tunein import TuneInBaseIE
+from .tvc import TVCIE
+from .tvopengr import TVOpenGrEmbedIE
+from .tvp import TVPEmbedIE
+from .twentymin import TwentyMinutenIE
+from .udn import UDNEmbedIE
+from .ustream import UstreamIE
+from .vbox7 import Vbox7IE
+from .vice import ViceIE
+from .videa import VideaIE
+from .videomore import VideomoreIE
+from .videopress import VideoPressIE
+from .viewlift import ViewLiftEmbedIE
+from .vimeo import VHXEmbedIE, VimeoIE
+from .viqeo import ViqeoIE
+from .vk import VKIE
+from .vshare import VShareIE
+from .vzaar import VzaarIE
+from .washingtonpost import WashingtonPostIE
+from .webcaster import WebcasterFeedIE
+from .wimtv import WimTVIE
+from .wistia import WistiaIE
+from .xfileshare import XFileShareIE
+from .xhamster import XHamsterEmbedIE
+from .yapfiles import YapFilesIE
+from .youporn import YouPornIE
  from .youtube import YoutubeIE
+from .zype import ZypeIE
  from ..compat import (
      compat_etree_fromstring,
      compat_str,
      compat_urllib_parse_unquote,
      compat_urlparse,
-    compat_xml_parse_error,
  )
  from ..utils import (
+    KNOWN_EXTENSIONS,
+    ExtractorError,
+    HEADRequest,
+    UnsupportedError,
      determine_ext,
      dict_get,
-    ExtractorError,
      float_or_none,
-    HEADRequest,
      int_or_none,
      is_html,
      js_to_json,
-    KNOWN_EXTENSIONS,
      merge_dicts,
      mimetype2ext,
      orderedSet,
@@ -36,119 +132,11 @@
      unescapeHTML,
      unified_timestamp,
      unsmuggle_url,
-    UnsupportedError,
      url_or_none,
      xpath_attr,
      xpath_text,
      xpath_with_ns,
  )
-from .commonprotocols import RtmpIE
-from .brightcove import (
-    BrightcoveLegacyIE,
-    BrightcoveNewIE,
-)
-from .nexx import (
-    NexxIE,
-    NexxEmbedIE,
-)
-from .nbc import NBCSportsVPlayerIE
-from .ooyala import OoyalaIE
-from .rutv import RUTVIE
-from .tvc import TVCIE
-from .sportbox import SportBoxIE
-from .myvi import MyviIE
-from .condenast import CondeNastIE
-from .udn import UDNEmbedIE
-from .senategov import SenateISVPIE
-from .svt import SVTIE
-from .pornhub import PornHubIE
-from .xhamster import XHamsterEmbedIE
-from .tnaflix import TNAFlixNetworkEmbedIE
-from .drtuber import DrTuberIE
-from .redtube import RedTubeIE
-from .tube8 import Tube8IE
-from .mofosex import MofosexEmbedIE
-from .spankwire import SpankwireIE
-from .youporn import YouPornIE
-from .vimeo import (
-    VimeoIE,
-    VHXEmbedIE,
-)
-from .dailymotion import DailymotionIE
-from .dailymail import DailyMailIE
-from .onionstudios import OnionStudiosIE
-from .viewlift import ViewLiftEmbedIE
-from .mtv import MTVServicesEmbeddedIE
-from .pladform import PladformIE
-from .videomore import VideomoreIE
-from .webcaster import WebcasterFeedIE
-from .googledrive import GoogleDriveIE
-from .jwplatform import JWPlatformIE
-from .digiteka import DigitekaIE
-from .arkena import ArkenaIE
-from .instagram import InstagramIE
-from .threeqsdn import ThreeQSDNIE
-from .theplatform import ThePlatformIE
-from .kaltura import KalturaIE
-from .eagleplatform import EaglePlatformIE
-from .facebook import FacebookIE
-from .soundcloud import SoundcloudEmbedIE
-from .tunein import TuneInBaseIE
-from .vbox7 import Vbox7IE
-from .dbtv import DBTVIE
-from .piksel import PikselIE
-from .videa import VideaIE
-from .twentymin import TwentyMinutenIE
-from .ustream import UstreamIE
-from .arte import ArteTVEmbedIE
-from .videopress import VideoPressIE
-from .rutube import RutubeIE
-from .glomex import GlomexEmbedIE
-from .megatvcom import MegaTVComEmbedIE
-from .ant1newsgr import Ant1NewsGrEmbedIE
-from .limelight import LimelightBaseIE
-from .anvato import AnvatoIE
-from .washingtonpost import WashingtonPostIE
-from .wistia import WistiaIE
-from .mediaset import MediasetIE
-from .joj import JojIE
-from .megaphone import MegaphoneIE
-from .vzaar import VzaarIE
-from .channel9 import Channel9IE
-from .vshare import VShareIE
-from .mediasite import MediasiteIE
-from .springboardplatform import SpringboardPlatformIE
-from .ted import TedEmbedIE
-from .yapfiles import YapFilesIE
-from .vice import ViceIE
-from .xfileshare import XFileShareIE
-from .cloudflarestream import CloudflareStreamIE
-from .peertube import PeerTubeIE
-from .teachable import TeachableIE
-from .indavideo import IndavideoEmbedIE
-from .apa import APAIE
-from .foxnews import FoxNewsIE
-from .viqeo import ViqeoIE
-from .expressen import ExpressenIE
-from .zype import ZypeIE
-from .odnoklassniki import OdnoklassnikiIE
-from .vk import VKIE
-from .kinja import KinjaEmbedIE
-from .gedidigital import GediDigitalIE
-from .rcs import RCSEmbedsIE
-from .bitchute import BitChuteIE
-from .rumble import RumbleEmbedIE
-from .arcpublishing import ArcPublishingIE
-from .medialaan import MedialaanIE
-from .simplecast import SimplecastIE
-from .wimtv import WimTVIE
-from .tvopengr import TVOpenGrEmbedIE
-from .ertgr import ERTWebtvEmbedIE
-from .tvp import TVPEmbedIE
-from .blogger import BloggerIE
-from .mainstreaming import MainStreamingIE
-from .gfycat import GfycatIE
-from .panopto import PanoptoBaseIE
  
  
  class GenericIE(InfoExtractor):
@@ -1042,20 +1030,6 @@ class GenericIE(InfoExtractor):
                  'filesize': 24687186,
              },
          },
-        {
-            'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
-            'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
-            'info_dict': {
-                'id': 'uxjb0lwrcz',
-                'ext': 'mp4',
-                'title': 'Conversation about Hexagonal Rails Part 1',
-                'description': 'a Martin Fowler video from ThoughtWorks',
-                'duration': 1715.0,
-                'uploader': 'thoughtworks.wistia.com',
-                'timestamp': 1401832161,
-                'upload_date': '20140603',
-            },
-        },
          # Wistia standard embed (async)
          {
              'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/',
@@ -2511,7 +2485,47 @@ class GenericIE(InfoExtractor):
                  'id': 'insert-a-quiz-into-a-panopto-video'
              },
              'playlist_count': 1
-        }
+        },
+        {
+            # Ruutu embed
+            'url': 'https://www.nelonen.fi/ohjelmat/madventures-suomi/2160731-riku-ja-tunna-lahtevat-peurajahtiin-tv-sta-tutun-biologin-kanssa---metsastysreissu-huipentuu-kasvissyojan-painajaiseen',
+            'md5': 'a2513a98d3496099e6eced40f7e6a14b',
+            'info_dict': {
+                'id': '4044426',
+                'ext': 'mp4',
+                'title': 'Riku ja Tunna lähtevät peurajahtiin tv:stä tutun biologin kanssa – metsästysreissu huipentuu kasvissyöjän painajaiseen!',
+                'thumbnail': r're:^https?://.+\.jpg$',
+                'duration': 108,
+                'series': 'Madventures Suomi',
+                'description': 'md5:aa55b44bd06a1e337a6f1d0b46507381',
+                'categories': ['Matkailu', 'Elämäntyyli'],
+                'age_limit': 0,
+                'upload_date': '20220308',
+            },
+        },
+        {
+            # Multiple Ruutu embeds
+            'url': 'https://www.hs.fi/kotimaa/art-2000008762560.html',
+            'info_dict': {
+                'title': 'Koronavirus | Epidemiahuippu voi olla Suomessa ohi, mutta koronaviruksen poistamista yleisvaarallisten tautien joukosta harkitaan vasta syksyllä',
+                'id': 'art-2000008762560'
+            },
+            'playlist_count': 3
+        },
+        {
+            # Ruutu embed in hs.fi with a single video
+            'url': 'https://www.hs.fi/kotimaa/art-2000008793421.html',
+            'md5': 'f8964e65d8fada6e8a562389bf366bb4',
+            'info_dict': {
+                'id': '4081841',
+                'ext': 'mp4',
+                'title': 'Puolustusvoimat siirsi panssariajoneuvoja harjoituksiin Niinisaloon 2.5.2022',
+                'thumbnail': r're:^https?://.+\.jpg$',
+                'duration': 138,
+                'age_limit': 0,
+                'upload_date': '20220504',
+            },
+        },
      ]
  
      def report_following_redirect(self, new_url):
@@ -2611,7 +2625,7 @@ def _extract_camtasia(self, url, video_id, webpage):
  
              entries.append({
                  'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
-                'title': '%s - %s' % (title, n.tag),
+                'title': f'{title} - {n.tag}',
                  'url': compat_urlparse.urljoin(url, url_n.text),
                  'duration': float_or_none(n.find('./duration').text),
              })
@@ -2633,7 +2647,7 @@ def _kvs_getrealurl(self, video_url, license_code):
  
          for o in range(len(newmagic) - 1, -1, -1):
              new = ''
-            l = (o + sum([int(n) for n in license[o:]])) % 32
+            l = (o + sum(int(n) for n in license[o:])) % 32
  
              for i in range(0, len(newmagic)):
                  if i == o:
@@ -2810,7 +2824,7 @@ def _real_extract(self, url):
          try:
              try:
                  doc = compat_etree_fromstring(webpage)
-            except compat_xml_parse_error:
+            except xml.etree.ElementTree.ParseError:
                  doc = compat_etree_fromstring(webpage.encode('utf-8'))
              if doc.tag == 'rss':
                  self.report_detected('RSS feed')
@@ -2845,7 +2859,7 @@ def _real_extract(self, url):
                  self.report_detected('F4M manifest')
                  self._sort_formats(info_dict['formats'])
                  return info_dict
-        except compat_xml_parse_error:
+        except xml.etree.ElementTree.ParseError:
              pass
  
          # Is it a Camtasia project?
@@ -3160,6 +3174,11 @@ def _real_extract(self, url):
          if sportbox_urls:
              return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key())
  
+        # Look for embedded Spotify player
+        spotify_urls = SpotifyBaseIE._extract_embed_urls(webpage)
+        if spotify_urls:
+            return self.playlist_from_matches(spotify_urls, video_id, video_title)
+
          # Look for embedded XHamster player
          xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
          if xhamster_urls:
@@ -3737,6 +3756,12 @@ def _real_extract(self, url):
          panopto_urls = PanoptoBaseIE._extract_urls(webpage)
          if panopto_urls:
              return self.playlist_from_matches(panopto_urls, video_id, video_title)
+
+        # Look for Ruutu embeds
+        ruutu_urls = RuutuIE._extract_urls(webpage)
+        if ruutu_urls:
+            return self.playlist_from_matches(ruutu_urls, video_id, video_title)
+
          # Look for HTML5 media
          entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
          if entries:
@@ -3749,7 +3774,7 @@ def _real_extract(self, url):
              else:
                  for num, entry in enumerate(entries, start=1):
                      entry.update({
-                        'id': '%s-%s' % (video_id, num),
+                        'id': f'{video_id}-{num}',
                          'title': '%s (%d)' % (video_title, num),
                      })
              for entry in entries:
@@ -3862,8 +3887,8 @@ def check_video(vurl):
              if RtmpIE.suitable(vurl):
                  return True
              vpath = compat_urlparse.urlparse(vurl).path
-            vext = determine_ext(vpath)
-            return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml')
+            vext = determine_ext(vpath, None)
+            return vext not in (None, 'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml')
  
          def filter_video(urls):
              return list(filter(check_video, urls))
@@ -3987,9 +4012,6 @@ def filter_video(urls):
                  # Look also in Refresh HTTP header
                  refresh_header = head_response.headers.get('Refresh')
                  if refresh_header:
-                    # In python 2 response HTTP headers are bytestrings
-                    if sys.version_info < (3, 0) and isinstance(refresh_header, str):
-                        refresh_header = refresh_header.decode('iso-8859-1')
                      found = re.search(REDIRECT_REGEX, refresh_header)
              if found:
                  new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1)))