[ie/ARDBetaMediathek] Fix series extraction (#8687)

[yt-dlp.git] / yt_dlp / extractor / ard.py
diff --git a/yt_dlp/extractor/ard.py b/yt_dlp/extractor/ard.py

index 7ea339b3999ba79ae23274868dcacc9e3b4b0ec7..8ac926c917fbaac4b5c7aa00a2386d72304193f0 100644 (file)
--- a/yt_dlp/extractor/ard.py
+++ b/yt_dlp/extractor/ard.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
  import json
  import re
  
@@ -16,6 +13,7 @@
      try_get,
      unified_strdate,
      unified_timestamp,
+    update_url,
      update_url_query,
      url_or_none,
      xpath_text,
@@ -43,14 +41,15 @@ def _parse_media_info(self, media_info, video_id, fsk):
                      'This video is not available due to geoblocking',
                      countries=self._GEO_COUNTRIES, metadata_available=True)
  
-        self._sort_formats(formats)
-
          subtitles = {}
          subtitle_url = media_info.get('_subtitleUrl')
          if subtitle_url:
              subtitles['de'] = [{
                  'ext': 'ttml',
                  'url': subtitle_url,
+            }, {
+                'ext': 'vtt',
+                'url': subtitle_url.replace('/ebutt/', '/webvtt/') + '.vtt',
              }]
  
          return {
@@ -265,7 +264,6 @@ def _real_extract(self, url):
                      'format_id': fid,
                      'url': furl,
                  })
-            self._sort_formats(formats)
              info = {
                  'formats': formats,
              }
@@ -292,16 +290,16 @@ def _real_extract(self, url):
  class ARDIE(InfoExtractor):
      _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html'
      _TESTS = [{
-        # available till 7.01.2022
-        'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html',
-        'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1',
+        # available till 7.12.2023
+        'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-video-424.html',
+        'md5': '94812e6438488fb923c361a44469614b',
          'info_dict': {
-            'id': 'maischberger-die-woche-video100',
-            'display_id': 'maischberger-die-woche-video100',
+            'id': 'maischberger-video-424',
+            'display_id': 'maischberger-video-424',
              'ext': 'mp4',
-            'duration': 3687.0,
-            'title': 'maischberger. die woche vom 7. Januar 2021',
-            'upload_date': '20210107',
+            'duration': 4452.0,
+            'title': 'maischberger am 07.12.2022',
+            'upload_date': '20221207',
              'thumbnail': r're:^https?://.*\.jpg$',
          },
      }, {
@@ -374,7 +372,6 @@ def _real_extract(self, url):
                      continue
                  f['url'] = format_url
              formats.append(f)
-        self._sort_formats(formats)
  
          _SUB_FORMATS = (
              ('./dataTimedText', 'ttml'),
@@ -406,12 +403,28 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
      _VALID_URL = r'''(?x)https://
          (?:(?:beta|www)\.)?ardmediathek\.de/
          (?:(?P<client>[^/]+)/)?
-        (?:player|live|video|(?P<playlist>sendung|sammlung))/
+        (?:player|live|video|(?P<playlist>sendung|serie|sammlung))/
          (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
          (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
          (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
  
      _TESTS = [{
+        'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
+        'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4',
+        'info_dict': {
+            'display_id': 'filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen',
+            'id': '12939099',
+            'title': 'Liebe auf vier Pfoten',
+            'description': r're:^Claudia Schmitt, Anwältin in Salzburg',
+            'duration': 5222,
+            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:aee7cbf8f06de976?w=960&ch=ae4d0f2ee47d8b9b',
+            'timestamp': 1701343800,
+            'upload_date': '20231130',
+            'ext': 'mp4',
+            'episode': 'Liebe auf vier Pfoten',
+            'series': 'Filme im MDR'
+        },
+    }, {
          'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
          'md5': 'a1dc75a39c61601b980648f7c9f9f71d',
          'info_dict': {
@@ -428,7 +441,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
          'skip': 'Error',
      }, {
          'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
-        'md5': 'f1837e563323b8a642a8ddeff0131f51',
+        'md5': '1e73ded21cb79bac065117e80c81dc88',
          'info_dict': {
              'id': '10049223',
              'ext': 'mp4',
@@ -436,13 +449,11 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
              'timestamp': 1636398000,
              'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b',
              'upload_date': '20211108',
-        },
-    }, {
-        'url': 'https://www.ardmediathek.de/sendung/beforeigners/beforeigners/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw/1',
-        'playlist_count': 6,
-        'info_dict': {
-            'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw',
-            'title': 'beforeigners/beforeigners/staffel-1',
+            'display_id': 'tagesschau-oder-tagesschau-20-00-uhr/das-erste',
+            'duration': 915,
+            'episode': 'tagesschau, 20:00 Uhr',
+            'series': 'tagesschau',
+            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678',
          },
      }, {
          'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
@@ -463,6 +474,10 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
          # playlist of type 'sendung'
          'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
          'only_matching': True,
+    }, {
+        # playlist of type 'serie'
+        'url': 'https://www.ardmediathek.de/serie/nachtstreife/staffel-1/Y3JpZDovL3N3ci5kZS9zZGIvc3RJZC8xMjQy/1',
+        'only_matching': True,
      }, {
          # playlist of type 'sammlung'
          'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
@@ -475,10 +490,11 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
          'only_matching': True,
      }]
  
-    def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber):
+    def _ARD_load_playlist_snippet(self, playlist_id, display_id, client, mode, page_number):
          """ Query the ARD server for playlist information
          and returns the data in "raw" format """
-        if mode == 'sendung':
+        assert mode in ('sendung', 'serie', 'sammlung')
+        if mode in ('sendung', 'serie'):
              graphQL = json.dumps({
                  'query': '''{
                      showPage(
@@ -495,7 +511,7 @@ def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, page
                              links { target { id href title } }
                              type
                          }
-                    }}''' % (client, playlist_id, pageNumber),
+                    }}''' % (client, playlist_id, page_number),
              }).encode()
          else:  # mode == 'sammlung'
              graphQL = json.dumps({
@@ -516,7 +532,7 @@ def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, page
                                  type
                              }
                          }
-                    }}''' % (client, playlist_id, pageNumber),
+                    }}''' % (client, playlist_id, page_number),
              }).encode()
          # Ressources for ARD graphQL debugging:
          # https://api-test.ardmediathek.de/public-gateway
@@ -526,7 +542,7 @@ def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, page
              data=graphQL,
              headers={'Content-Type': 'application/json'})['data']
          # align the structure of the returned data:
-        if mode == 'sendung':
+        if mode in ('sendung', 'serie'):
              show_page = show_page['showPage']
          else:  # mode == 'sammlung'
              show_page = show_page['morePage']['widget']
@@ -534,12 +550,12 @@ def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, page
  
      def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
          """ Collects all playlist entries and returns them as info dict.
-        Supports playlists of mode 'sendung' and 'sammlung', and also nested
-        playlists. """
+        Supports playlists of mode 'sendung', 'serie', and 'sammlung',
+        as well as nested playlists. """
          entries = []
          pageNumber = 0
          while True:  # iterate by pageNumber
-            show_page = self._ARD_load_playlist_snipped(
+            show_page = self._ARD_load_playlist_snippet(
                  playlist_id, display_id, client, mode, pageNumber)
              for teaser in show_page['teasers']:  # process playlist items
                  if '/compilation/' in teaser['links']['target']['href']:
@@ -606,6 +622,9 @@ def _real_extract(self, url):
      show {
        title
      }
+    image {
+      src
+    }
      synopsis
      title
      tracking {
@@ -644,6 +663,15 @@ def _real_extract(self, url):
              'description': description,
              'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
              'series': try_get(player_page, lambda x: x['show']['title']),
+            'thumbnail': (media_collection.get('_previewImage')
+                          or try_get(player_page, lambda x: update_url(x['image']['src'], query=None, fragment=None))
+                          or self.get_thumbnail_from_html(display_id, url)),
          })
          info.update(self._ARD_extract_episode_info(info['title']))
          return info
+
+    def get_thumbnail_from_html(self, display_id, url):
+        webpage = self._download_webpage(url, display_id, fatal=False) or ''
+        return (
+            self._og_search_thumbnail(webpage, default=None)
+            or self._html_search_meta('thumbnailUrl', webpage, default=None))