Update to ytdl-commit-cf2dbec

[yt-dlp.git] / youtube_dlc / extractor / generic.py
diff --git a/youtube_dlc/extractor/generic.py b/youtube_dlc/extractor/generic.py

index 7711de01b76fc703d9b2003b50a2baa0a3454e25..8cde11d2b8d8ba8ff87e9289d4aa1fec822b81d0 100644 (file)
--- a/youtube_dlc/extractor/generic.py
+++ b/youtube_dlc/extractor/generic.py
@@ -20,19 +20,24 @@
      ExtractorError,
      float_or_none,
      HEADRequest,
+    int_or_none,
      is_html,
      js_to_json,
      KNOWN_EXTENSIONS,
      merge_dicts,
      mimetype2ext,
      orderedSet,
+    parse_duration,
      sanitized_Request,
      smuggle_url,
      unescapeHTML,
-    unified_strdate,
+    unified_timestamp,
      unsmuggle_url,
      UnsupportedError,
+    url_or_none,
+    xpath_attr,
      xpath_text,
+    xpath_with_ns,
  )
  from .commonprotocols import RtmpIE
  from .brightcove import (
@@ -48,7 +53,6 @@
  from .rutv import RUTVIE
  from .tvc import TVCIE
  from .sportbox import SportBoxIE
-from .smotri import SmotriIE
  from .myvi import MyviIE
  from .condenast import CondeNastIE
  from .udn import UDNEmbedIE
@@ -63,7 +67,10 @@
  from .mofosex import MofosexEmbedIE
  from .spankwire import SpankwireIE
  from .youporn import YouPornIE
-from .vimeo import VimeoIE
+from .vimeo import (
+    VimeoIE,
+    VHXEmbedIE,
+)
  from .dailymotion import DailymotionIE
  from .dailymail import DailyMailIE
  from .onionstudios import OnionStudiosIE
@@ -120,7 +127,13 @@
  from .zype import ZypeIE
  from .odnoklassniki import OdnoklassnikiIE
  from .kinja import KinjaEmbedIE
+from .gedi import GediEmbedsIE
+from .rcs import RCSEmbedsIE
  from .bitchute import BitChuteIE
+from .rumble import RumbleEmbedIE
+from .arcpublishing import ArcPublishingIE
+from .medialaan import MedialaanIE
+from .simplecast import SimplecastIE
  
  
  class GenericIE(InfoExtractor):
@@ -199,11 +212,46 @@ class GenericIE(InfoExtractor):
          {
              'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
              'info_dict': {
-                'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
-                'ext': 'm4v',
-                'upload_date': '20150228',
-                'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
-            }
+                'id': 'http://podcastfeeds.nbcnews.com/nbcnews/video/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
+                'title': 'MSNBC Rachel Maddow (video)',
+                'description': 're:.*her unique approach to storytelling.*',
+            },
+            'playlist': [{
+                'info_dict': {
+                    'ext': 'mov',
+                    'id': 'pdv_maddow_netcast_mov-12-03-2020-223726',
+                    'title': 'MSNBC Rachel Maddow (video) - 12-03-2020-223726',
+                    'description': 're:.*her unique approach to storytelling.*',
+                    'upload_date': '20201204',
+                },
+            }],
+        },
+        # RSS feed with item with description and thumbnails
+        {
+            'url': 'https://anchor.fm/s/dd00e14/podcast/rss',
+            'info_dict': {
+                'id': 'https://anchor.fm/s/dd00e14/podcast/rss',
+                'title': 're:.*100% Hydrogen.*',
+                'description': 're:.*In this episode.*',
+            },
+            'playlist': [{
+                'info_dict': {
+                    'ext': 'm4a',
+                    'id': 'c1c879525ce2cb640b344507e682c36d',
+                    'title': 're:Hydrogen!',
+                    'description': 're:.*In this episode we are going.*',
+                    'timestamp': 1567977776,
+                    'upload_date': '20190908',
+                    'duration': 459,
+                    'thumbnail': r're:^https?://.*\.jpg$',
+                    'episode_number': 1,
+                    'season_number': 1,
+                    'age_limit': 0,
+                },
+            }],
+            'params': {
+                'skip_download': True,
+            },
          },
          # RSS feed with enclosures and unsupported link URLs
          {
@@ -1984,22 +2032,6 @@ class GenericIE(InfoExtractor):
              },
              'add_ie': [SpringboardPlatformIE.ie_key()],
          },
-        {
-            'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU',
-            'info_dict': {
-                'id': 'uPDB5I9wfp8',
-                'ext': 'webm',
-                'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3',
-                'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d',
-                'upload_date': '20160219',
-                'uploader': 'Pocoyo - Português (BR)',
-                'uploader_id': 'PocoyoBrazil',
-            },
-            'add_ie': [YoutubeIE.ie_key()],
-            'params': {
-                'skip_download': True,
-            },
-        },
          {
              'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html',
              'info_dict': {
@@ -2104,23 +2136,23 @@ class GenericIE(InfoExtractor):
                  'skip_download': True,
              },
          },
-        {
-            # Zype embed
-            'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites',
-            'info_dict': {
-                'id': '5b400b834b32992a310622b9',
-                'ext': 'mp4',
-                'title': 'Smoky Barbecue Favorites',
-                'thumbnail': r're:^https?://.*\.jpe?g',
-                'description': 'md5:5ff01e76316bd8d46508af26dc86023b',
-                'upload_date': '20170909',
-                'timestamp': 1504915200,
-            },
-            'add_ie': [ZypeIE.ie_key()],
-            'params': {
-                'skip_download': True,
-            },
-        },
+        # {
+        #     # Zype embed
+        #     'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites',
+        #     'info_dict': {
+        #         'id': '5b400b834b32992a310622b9',
+        #         'ext': 'mp4',
+        #         'title': 'Smoky Barbecue Favorites',
+        #         'thumbnail': r're:^https?://.*\.jpe?g',
+        #         'description': 'md5:5ff01e76316bd8d46508af26dc86023b',
+        #         'upload_date': '20170909',
+        #         'timestamp': 1504915200,
+        #     },
+        #     'add_ie': [ZypeIE.ie_key()],
+        #     'params': {
+        #         'skip_download': True,
+        #     },
+        # },
          {
              # videojs embed
              'url': 'https://video.sibnet.ru/shell.php?videoid=3422904',
@@ -2169,7 +2201,55 @@ class GenericIE(InfoExtractor):
          #     'params': {
          #         'force_generic_extractor': True,
          #     },
-        # }
+        # },
+        {
+            # VHX Embed
+            'url': 'https://demo.vhx.tv/category-c/videos/file-example-mp4-480-1-5mg-copy',
+            'info_dict': {
+                'id': '858208',
+                'ext': 'mp4',
+                'title': 'Untitled',
+                'uploader_id': 'user80538407',
+                'uploader': 'OTT Videos',
+            },
+        },
+        {
+            # ArcPublishing PoWa video player
+            'url': 'https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/',
+            'md5': 'b03b2fac8680e1e5a7cc81a5c27e71b3',
+            'info_dict': {
+                'id': '8c99cb6e-b29c-4bc9-9173-7bf9979225ab',
+                'ext': 'mp4',
+                'title': 'Senate candidates wave to voters on Anchorage streets',
+                'description': 'md5:91f51a6511f090617353dc720318b20e',
+                'timestamp': 1604378735,
+                'upload_date': '20201103',
+                'duration': 1581,
+            },
+        },
+        {
+            # MyChannels SDK embed
+            # https://www.24kitchen.nl/populair/deskundige-dit-waarom-sommigen-gevoelig-zijn-voor-voedselallergieen
+            'url': 'https://www.demorgen.be/nieuws/burgemeester-rotterdam-richt-zich-in-videoboodschap-tot-relschoppers-voelt-het-goed~b0bcfd741/',
+            'md5': '90c0699c37006ef18e198c032d81739c',
+            'info_dict': {
+                'id': '194165',
+                'ext': 'mp4',
+                'title': 'Burgemeester Aboutaleb spreekt relschoppers toe',
+                'timestamp': 1611740340,
+                'upload_date': '20210127',
+                'duration': 159,
+            },
+        },
+        {
+            # Simplecast player embed
+            'url': 'https://www.bio.org/podcast',
+            'info_dict': {
+                'id': 'podcast',
+                'title': 'I AM BIO Podcast | BIO',
+            },
+            'playlist_mincount': 52,
+        },
      ]
  
      def report_following_redirect(self, new_url):
@@ -2181,6 +2261,10 @@ def _extract_rss(self, url, video_id, doc):
          playlist_desc_el = doc.find('./channel/description')
          playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
  
+        NS_MAP = {
+            'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
+        }
+
          entries = []
          for it in doc.findall('./channel/item'):
              next_url = None
@@ -2196,10 +2280,33 @@ def _extract_rss(self, url, video_id, doc):
              if not next_url:
                  continue
  
+            def itunes(key):
+                return xpath_text(
+                    it, xpath_with_ns('./itunes:%s' % key, NS_MAP),
+                    default=None)
+
+            duration = itunes('duration')
+            explicit = (itunes('explicit') or '').lower()
+            if explicit in ('true', 'yes'):
+                age_limit = 18
+            elif explicit in ('false', 'no'):
+                age_limit = 0
+            else:
+                age_limit = None
+
              entries.append({
                  '_type': 'url_transparent',
                  'url': next_url,
                  'title': it.find('title').text,
+                'description': xpath_text(it, 'description', default=None),
+                'timestamp': unified_timestamp(
+                    xpath_text(it, 'pubDate', default=None)),
+                'duration': int_or_none(duration) or parse_duration(duration),
+                'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')),
+                'episode': itunes('title'),
+                'episode_number': int_or_none(itunes('episode')),
+                'season_number': int_or_none(itunes('season')),
+                'age_limit': age_limit,
              })
  
          return {
@@ -2319,7 +2426,7 @@ def _real_extract(self, url):
          info_dict = {
              'id': video_id,
              'title': self._generic_title(url),
-            'upload_date': unified_strdate(head_response.headers.get('Last-Modified'))
+            'timestamp': unified_timestamp(head_response.headers.get('Last-Modified'))
          }
  
          # Check for direct link to a video
@@ -2382,6 +2489,9 @@ def _real_extract(self, url):
          webpage = self._webpage_read_content(
              full_response, url, video_id, prefix=first_bytes)
  
+        if '<title>DPG Media Privacy Gate</title>' in webpage:
+            webpage = self._download_webpage(url, video_id)
+
          self.report_extraction(video_id)
  
          # Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest?
@@ -2425,7 +2535,9 @@ def _real_extract(self, url):
          # Sometimes embedded video player is hidden behind percent encoding
          # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
          # Unescaping the whole page allows to handle those cases in a generic way
-        webpage = compat_urllib_parse_unquote(webpage)
+        # FIXME: unescaping the whole page may break URLs, so this is commented out for now.
+        # There should probably be a second run of the generic extractor on the unescaped webpage.
+        # webpage = compat_urllib_parse_unquote(webpage)
  
          # Unescape squarespace embeds to be detected by generic extractor,
          # see https://github.com/ytdl-org/youtube-dl/issues/21294
@@ -2507,6 +2619,15 @@ def _real_extract(self, url):
          if tp_urls:
              return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform')
  
+        arc_urls = ArcPublishingIE._extract_urls(webpage)
+        if arc_urls:
+            return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key())
+
+        mychannels_urls = MedialaanIE._extract_urls(webpage)
+        if mychannels_urls:
+            return self.playlist_from_matches(
+                mychannels_urls, video_id, video_title, ie=MedialaanIE.ie_key())
+
          # Look for embedded rtl.nl player
          matches = re.findall(
              r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"',
@@ -2518,6 +2639,10 @@ def _real_extract(self, url):
          if vimeo_urls:
              return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key())
  
+        vhx_url = VHXEmbedIE._extract_url(webpage)
+        if vhx_url:
+            return self.url_result(vhx_url, VHXEmbedIE.ie_key())
+
          vid_me_embed_url = self._search_regex(
              r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
              webpage, 'vid.me embed', default=None)
@@ -2679,6 +2804,12 @@ def _real_extract(self, url):
              return self.playlist_from_matches(
                  matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')
  
+        # Look for Simplecast embeds
+        simplecast_urls = SimplecastIE._extract_urls(webpage)
+        if simplecast_urls:
+            return self.playlist_from_matches(
+                simplecast_urls, video_id, video_title)
+
          # Look for BBC iPlayer embed
          matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
          if matches:
@@ -2773,11 +2904,6 @@ def _real_extract(self, url):
          if mobj is not None:
              return self.url_result(mobj.group('url'))
  
-        # Look for embedded smotri.com player
-        smotri_url = SmotriIE._extract_url(webpage)
-        if smotri_url:
-            return self.url_result(smotri_url, 'Smotri')
-
          # Look for embedded Myvi.ru player
          myvi_url = MyviIE._extract_url(webpage)
          if myvi_url:
@@ -3213,11 +3339,29 @@ def _real_extract(self, url):
              return self.playlist_from_matches(
                  zype_urls, video_id, video_title, ie=ZypeIE.ie_key())
  
+        # Look for Gedi and RCS media group embeds
+        gedi_urls = GediEmbedsIE._extract_urls(webpage)
+        if gedi_urls:
+            return self.playlist_from_matches(
+                gedi_urls, video_id, video_title, ie=GediEmbedsIE.ie_key())
+
+        rcs_urls = RCSEmbedsIE._extract_urls(webpage)
+        if rcs_urls:
+            return self.playlist_from_matches(
+                rcs_urls, video_id, video_title, ie=RCSEmbedsIE.ie_key())
+
          bitchute_urls = BitChuteIE._extract_urls(webpage)
          if bitchute_urls:
              return self.playlist_from_matches(
                  bitchute_urls, video_id, video_title, ie=BitChuteIE.ie_key())
  
+        rumble_urls = RumbleEmbedIE._extract_urls(webpage)
+        if len(rumble_urls) == 1:
+            return self.url_result(rumble_urls[0], RumbleEmbedIE.ie_key())
+        if rumble_urls:
+            return self.playlist_from_matches(
+                rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key())
+
          # Look for HTML5 media
          entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
          if entries: