Merge branch 'ard.py_add_playlist_support' of https://github.com/martin54/youtube...

author Tom-Oliver Heidel <redacted>

Tue, 8 Sep 2020 16:31:53 +0000 (18:31 +0200)

committer Tom-Oliver Heidel <redacted>

Tue, 8 Sep 2020 16:31:53 +0000 (18:31 +0200)
author Tom-Oliver Heidel <redacted>
Tue, 8 Sep 2020 16:31:53 +0000 (18:31 +0200)
committer Tom-Oliver Heidel <redacted>
Tue, 8 Sep 2020 16:31:53 +0000 (18:31 +0200)
diff --combined youtube_dlc/extractor/ard.py

index 5b7b2dd6d2ee2427d0df3920f982b3eece6d67fc,6f1e477a90322980e4e8368f9e0e5b1eb921d7b3..6f1e477a90322980e4e8368f9e0e5b1eb921d7b3
--- 1/youtube_dlc/extractor/ard.py
--- 2/youtube_dl/extractor/ard.py
+++ b/youtube_dlc/extractor/ard.py
@@@ -62,6 -62,45 +62,45 @@@ class ARDMediathekBaseIE(InfoExtractor)
               'subtitles': subtitles,
           }
   
+     def _ARD_extract_episode_info(self, title):
+         """Derive season/episode metadata from a video title.
+ 
+         The title is tested against a fixed, ordered list of regular
+         expressions; only the first one that matches is used.
+ 
+         Returns an empty dict when ``title`` is falsy.  Otherwise the dict
+         always carries 'episode' (the episode name, falling back to the
+         stripped title).  'season_number' and 'episode_number' are only
+         present when one of the patterns matched; they are passed through
+         ``int_or_none`` and so may be None (e.g. for patterns that have no
+         season group).
+         """
+         if not title:
+             return {}
+ 
+         # Order matters: the first matching pattern wins.
+         title_patterns = (
+             # Pattern for title like "Homo sapiens (S06/E07) - Originalversion"
+             # from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw
+             r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
+             # E.g.: title="Fritjof aus Norwegen (2) (AD)"
+             # from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/
+             r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
+             # "Folge <n>" (optionally followed by ':' or ' -'), a space and
+             # then the episode name in double quotes:
+             r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
+             # E.g.: title="Folge 25/42: Symmetrie"
+             # from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/
+             # E.g.: title="Folge 1063 - Vertrauen"
+             # from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/
+             r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
+         )
+         attempts = (re.match(candidate, title) for candidate in title_patterns)
+         hit = next((found for found in attempts if found), None)
+ 
+         info = {}
+         if hit is not None:
+             groups = hit.groupdict()
+             marker = groups.get('ep_info')
+             episode_name = str_or_none(groups.get('episode'))
+             if marker and not episode_name:
+                 # No explicit name captured: use the title with the numeric
+                 # episode marker cut out.
+                 episode_name = str_or_none(title.replace(marker, ''))
+             if episode_name:
+                 episode_name = episode_name.strip()
+             info = {
+                 'season_number': int_or_none(groups.get('season_number')),
+                 'episode_number': int_or_none(groups.get('episode_number')),
+                 'episode': episode_name,
+             }
+ 
+         # Fallback: without a usable episode name, use the whole title.
+         if not info.get('episode'):
+             info['episode'] = title.strip()
+         return info
+ 
       def _extract_formats(self, media_info, video_id):
           type_ = media_info.get('_type')
           media_array = media_info.get('_mediaArray', [])
@@@ -244,6 -283,7 +283,7 @@@ class ARDMediathekIE(ARDMediathekBaseIE
               'description': description,
               'thumbnail': thumbnail,
           })
+         info.update(self._ARD_extract_episode_info(info['title']))
   
           return info
   
@@@ -313,7 -353,7 +353,7 @@@ class ARDIE(InfoExtractor)
   
   
   class ARDBetaMediathekIE(ARDMediathekBaseIE):
-     _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?:player|live|video)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)'
+     _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?P<mode>player|live|video|sendung|sammlung)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)'
       _TESTS = [{
           'url': 'https://ardmediathek.de/ard/video/die-robuste-roswita/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
           'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f',
@@@ -343,8 -383,112 +383,112 @@@
       }, {
           'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
           'only_matching': True,
+     }, {
+         # playlist of type 'sendung'
+         'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
+         'only_matching': True,
+     }, {
+         # playlist of type 'sammlung'
+         'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
+         'only_matching': True,
       }]
   
+     def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber):
+         """Fetch one page of playlist data from the ARD GraphQL gateway.
+ 
+         A GraphQL query is built from ``client``, ``playlist_id`` and
+         ``pageNumber`` and sent as the JSON request body (``data=``) to
+         https://api.ardmediathek.de/public-gateway via ``_download_json``.
+ 
+         ``mode`` selects the query: 'sendung' uses ``showPage`` (with
+         ``showId``); any other value is treated as 'sammlung' and uses
+         ``morePage`` (with ``compilationId``).  ``display_id`` is only used
+         in the '[Playlist] ...' label handed to ``_download_json``.
+ 
+         Returns the ``showPage`` object for 'sendung', otherwise the
+         ``widget`` object inside ``morePage``.  Both queries request
+         'pagination' (pageSize, totalElements) and 'teasers', so both results
+         are expected to have the same layout.  The 'data', 'showPage',
+         'morePage' and 'widget' keys are indexed directly, so a response
+         lacking them raises KeyError.
+         """
+         if mode == 'sendung':
+             graphQL = json.dumps({
+                 'query': '''{
+                     showPage(
+                         client: "%s"
+                         showId: "%s"
+                         pageNumber: %d
+                     ) {
+                         pagination {
+                             pageSize
+                             totalElements
+                         }
+                         teasers {        # Array
+                             mediumTitle
+                             links { target { id href title } }
+                             type
+                         }
+                     }}''' % (client, playlist_id, pageNumber),
+             }).encode()
+         else:  # every other mode is handled as 'sammlung'
+             graphQL = json.dumps({
+                 'query': '''{
+                     morePage(
+                         client: "%s"
+                         compilationId: "%s"
+                         pageNumber: %d
+                     ) {
+                         widget {
+                             pagination {
+                                 pageSize
+                                 totalElements
+                             }
+                             teasers {        # Array
+                                 mediumTitle
+                                 links { target { id href title } }
+                                 type
+                             }
+                         }
+                     }}''' % (client, playlist_id, pageNumber),
+             }).encode()
+         # Resources for ARD GraphQL debugging:
+         # https://api-test.ardmediathek.de/public-gateway
+         show_page = self._download_json(
+             'https://api.ardmediathek.de/public-gateway',
+             '[Playlist] %s' % display_id,
+             data=graphQL,
+             headers={'Content-Type': 'application/json'})['data']
+         # Unwrap the mode-specific envelope so that callers get the same
+         # structure for both playlist types:
+         if mode == 'sendung':
+             show_page = show_page['showPage']
+         else:  # every other mode is handled as 'sammlung'
+             show_page = show_page['morePage']['widget']
+         return show_page
+ 
+     def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
+         """Collect all entries of a playlist and return them as a playlist.
+ 
+         Pages through the playlist with ``_ARD_load_playlist_snipped``,
+         starting at page 0, and turns every teaser into a ``url_result``
+         pointing at an ardmediathek.de URL handled by ARDBetaMediathekIE.
+         Teasers whose link contains '/compilation/' are emitted as
+         'sammlung' URLs (nested playlists), all others as 'video' URLs.
+ 
+         ``url`` is not used; it is kept so the signature stays unchanged.
+ 
+         Returns ``self.playlist_result(entries, playlist_title=display_id)``.
+         """
+         entries = []
+         pageNumber = 0
+         while True:  # iterate by pageNumber
+             show_page = self._ARD_load_playlist_snipped(
+                 playlist_id, display_id, client, mode, pageNumber)
+             # A missing or null teaser list is treated like an empty page.
+             teasers = show_page.get('teasers') or []
+             for teaser in teasers:  # process playlist items
+                 target = teaser['links']['target']
+                 if '/compilation/' in (target.get('href') or ''):
+                     # alternative condition: teaser['type'] == "compilation"
+                     # => This is a nested compilation, e.g. like:
+                     # https://www.ardmediathek.de/ard/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2/
+                     link_mode = 'sammlung'
+                 else:
+                     link_mode = 'video'
+ 
+                 # Build a URL slug from the episode title similar to ARD:
+                 # lower-case, transliterate umlauts, replace every run of
+                 # special chars by '-' and drop '-' at the begin/end.
+                 slug = re.sub(
+                     '[^a-zA-Z0-9]+', '-',
+                     (target.get('title') or '').lower()
+                     .replace('ä', 'ae').replace('ö', 'oe')
+                     .replace('ü', 'ue').replace('ß', 'ss')).strip('-')
+ 
+                 path = [client, link_mode, display_id]
+                 if slug:
+                     # An empty slug would put '//' into the URL, which the
+                     # extractor's URL pattern cannot parse correctly.
+                     path.append(slug)
+                 path.append(target['id'])
+                 item_url = 'https://www.ardmediathek.de/' + '/'.join(
+                     '%s' % part for part in path)
+                 entries.append(self.url_result(
+                     item_url,
+                     ie=ARDBetaMediathekIE.ie_key()))
+ 
+             pagination = show_page.get('pagination') or {}
+             page_size = pagination.get('pageSize') or 0
+             total_elements = pagination.get('totalElements') or 0
+             if (not teasers or page_size <= 0
+                     or page_size * (pageNumber + 1) >= total_elements):
+                 # Either we've processed enough pages to get all playlist
+                 # entries, or the server gave us nothing to page with
+                 # (empty page / no usable page size); stop instead of
+                 # requesting pages forever.
+                 break
+             pageNumber += 1
+ 
+         return self.playlist_result(entries, playlist_title=display_id)
+ 
       def _real_extract(self, url):
           mobj = re.match(self._VALID_URL, url)
           video_id = mobj.group('video_id')
@@@ -354,6 -498,13 +498,13 @@@
           if not display_id:
               display_id = video_id
   
+         if mobj.group('mode') in ('sendung', 'sammlung'):
+             # this is a playlist-URL
+             return self._ARD_extract_playlist(
+                 url, video_id, display_id,
+                 mobj.group('client'),
+                 mobj.group('mode'))
+ 
           player_page = self._download_json(
               'https://api.ardmediathek.de/public-gateway',
               display_id, data=json.dumps({
@@@ -419,4 -570,5 +570,5 @@@
               'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
               'series': try_get(player_page, lambda x: x['show']['title']),
           })
+         info.update(self._ARD_extract_episode_info(info['title']))
           return info
author	Tom-Oliver Heidel <redacted>
	Tue, 8 Sep 2020 16:31:53 +0000 (18:31 +0200)
committer	Tom-Oliver Heidel <redacted>
	Tue, 8 Sep 2020 16:31:53 +0000 (18:31 +0200)