[extractor] Deprecate `_sort_formats`

[yt-dlp.git] / yt_dlp / extractor / ard.py
diff --git a/yt_dlp/extractor/ard.py b/yt_dlp/extractor/ard.py

index 4d90be7140d4f8403d5329ed6416c7cff2d014ac..0a8a8746abcdec300bd6e719a62f0b688f03340f 100644 (file)
--- a/yt_dlp/extractor/ard.py
+++ b/yt_dlp/extractor/ard.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
  import json
  import re
  
@@ -43,8 +40,6 @@ def _parse_media_info(self, media_info, video_id, fsk):
                      'This video is not available due to geoblocking',
                      countries=self._GEO_COUNTRIES, metadata_available=True)
  
-        self._sort_formats(formats)
-
          subtitles = {}
          subtitle_url = media_info.get('_subtitleUrl')
          if subtitle_url:
@@ -199,7 +194,7 @@ def suitable(cls, url):
  
      def _real_extract(self, url):
          # determine video id from url
-        m = re.match(self._VALID_URL, url)
+        m = self._match_valid_url(url)
  
          document_id = None
  
@@ -265,7 +260,6 @@ def _real_extract(self, url):
                      'format_id': fid,
                      'url': furl,
                  })
-            self._sort_formats(formats)
              info = {
                  'formats': formats,
              }
@@ -280,7 +274,7 @@ def _real_extract(self, url):
  
          info.update({
              'id': video_id,
-            'title': self._live_title(title) if info.get('is_live') else title,
+            'title': title,
              'description': description,
              'thumbnail': thumbnail,
          })
@@ -290,14 +284,14 @@ def _real_extract(self, url):
  
  
  class ARDIE(InfoExtractor):
-    _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P<display_id>[^/?#]+)-(?:video-?)?(?P<id>[0-9]+))\.html'
+    _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html'
      _TESTS = [{
          # available till 7.01.2022
          'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html',
          'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1',
          'info_dict': {
-            'display_id': 'maischberger-die-woche',
-            'id': '100',
+            'id': 'maischberger-die-woche-video100',
+            'display_id': 'maischberger-die-woche-video100',
              'ext': 'mp4',
              'duration': 3687.0,
              'title': 'maischberger. die woche vom 7. Januar 2021',
@@ -305,16 +299,28 @@ class ARDIE(InfoExtractor):
              'thumbnail': r're:^https?://.*\.jpg$',
          },
      }, {
-        'url': 'https://www.daserste.de/information/reportage-dokumentation/erlebnis-erde/videosextern/woelfe-und-herdenschutzhunde-ungleiche-brueder-102.html',
+        'url': 'https://www.daserste.de/information/politik-weltgeschehen/morgenmagazin/videosextern/dominik-kahun-aus-der-nhl-direkt-zur-weltmeisterschaft-100.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.daserste.de/information/nachrichten-wetter/tagesthemen/videosextern/tagesthemen-17736.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/videos/diversity-tag-sanam-afrashteh100.html',
          'only_matching': True,
      }, {
          'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
          'only_matching': True,
+    }, {
+        'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/Drehpause-100.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.daserste.de/unterhaltung/film/filmmittwoch-im-ersten/videos/making-ofwendezeit-video-100.html',
+        'only_matching': True,
      }]
  
      def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        display_id = mobj.group('display_id')
+        mobj = self._match_valid_url(url)
+        display_id = mobj.group('id')
  
          player_url = mobj.group('mainurl') + '~playerXml.xml'
          doc = self._download_xml(player_url, display_id)
@@ -362,11 +368,25 @@ def _real_extract(self, url):
                      continue
                  f['url'] = format_url
              formats.append(f)
-        self._sort_formats(formats)
+
+        _SUB_FORMATS = (
+            ('./dataTimedText', 'ttml'),
+            ('./dataTimedTextNoOffset', 'ttml'),
+            ('./dataTimedTextVtt', 'vtt'),
+        )
+
+        subtitles = {}
+        for subsel, subext in _SUB_FORMATS:
+            for node in video_node.findall(subsel):
+                subtitles.setdefault('de', []).append({
+                    'url': node.attrib['url'],
+                    'ext': subext,
+                })
  
          return {
-            'id': mobj.group('id'),
+            'id': xpath_text(video_node, './videoId', default=display_id),
              'formats': formats,
+            'subtitles': subtitles,
              'display_id': display_id,
              'title': video_node.find('./title').text,
              'duration': parse_duration(video_node.find('./duration').text),
@@ -376,7 +396,14 @@ def _real_extract(self, url):
  
  
  class ARDBetaMediathekIE(ARDMediathekBaseIE):
-    _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?P<mode>player|live|video|sendung|sammlung)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)'
+    _VALID_URL = r'''(?x)https://
+        (?:(?:beta|www)\.)?ardmediathek\.de/
+        (?:(?P<client>[^/]+)/)?
+        (?:player|live|video|(?P<playlist>sendung|sammlung))/
+        (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
+        (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
+        (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
+
      _TESTS = [{
          'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
          'md5': 'a1dc75a39c61601b980648f7c9f9f71d',
@@ -391,6 +418,25 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
              'upload_date': '20200805',
              'ext': 'mp4',
          },
+        'skip': 'Error',
+    }, {
+        'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
+        'md5': 'f1837e563323b8a642a8ddeff0131f51',
+        'info_dict': {
+            'id': '10049223',
+            'ext': 'mp4',
+            'title': 'tagesschau, 20:00 Uhr',
+            'timestamp': 1636398000,
+            'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b',
+            'upload_date': '20211108',
+        },
+    }, {
+        'url': 'https://www.ardmediathek.de/sendung/beforeigners/beforeigners/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw/1',
+        'playlist_count': 6,
+        'info_dict': {
+            'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw',
+            'title': 'beforeigners/beforeigners/staffel-1',
+        },
      }, {
          'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
          'only_matching': True,
@@ -414,6 +460,12 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
          # playlist of type 'sammlung'
          'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
          'only_matching': True,
+    }, {
+        'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet',
+        'only_matching': True,
      }]
  
      def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber):
@@ -510,23 +562,16 @@ def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
                  break
              pageNumber = pageNumber + 1
  
-        return self.playlist_result(entries, playlist_title=display_id)
+        return self.playlist_result(entries, playlist_id, playlist_title=display_id)
  
      def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('video_id')
-        display_id = mobj.group('display_id')
-        if display_id:
-            display_id = display_id.rstrip('/')
-        if not display_id:
-            display_id = video_id
-
-        if mobj.group('mode') in ('sendung', 'sammlung'):
-            # this is a playlist-URL
-            return self._ARD_extract_playlist(
-                url, video_id, display_id,
-                mobj.group('client'),
-                mobj.group('mode'))
+        video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group(
+            'id', 'display_id', 'playlist', 'client', 'season')
+        display_id, client = display_id or video_id, client or 'ard'
+
+        if playlist_type:
+            # TODO: Extract only specified season
+            return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type)
  
          player_page = self._download_json(
              'https://api.ardmediathek.de/public-gateway',
@@ -562,7 +607,7 @@ def _real_extract(self, url):
        }
      }
    }
-}''' % (mobj.group('client'), video_id),
+}''' % (client, video_id),
              }).encode(), headers={
                  'Content-Type': 'application/json'
              })['data']['playerPage']