jfr.im git - yt-dlp.git/commitdiff
Update to ytdl-2021.01.08
author pukkandan <redacted>
Fri, 8 Jan 2021 16:14:50 +0000 (21:44 +0530)
committer pukkandan <redacted>
Fri, 8 Jan 2021 16:29:10 +0000 (21:59 +0530)
24 files changed:
docs/supportedsites.md
test/test_subtitles.py
test/test_utils.py
youtube_dlc/downloader/hls.py
youtube_dlc/extractor/acast.py
youtube_dlc/extractor/applepodcasts.py [new file with mode: 0644]
youtube_dlc/extractor/bfmtv.py [new file with mode: 0644]
youtube_dlc/extractor/bibeltv.py [new file with mode: 0644]
youtube_dlc/extractor/canvas.py
youtube_dlc/extractor/dplay.py
youtube_dlc/extractor/extractors.py
youtube_dlc/extractor/googleplus.py [deleted file]
youtube_dlc/extractor/googlepodcasts.py [new file with mode: 0644]
youtube_dlc/extractor/iheart.py [new file with mode: 0644]
youtube_dlc/extractor/ketnet.py
youtube_dlc/extractor/motherless.py
youtube_dlc/extractor/nrk.py
youtube_dlc/extractor/rai.py
youtube_dlc/extractor/sbs.py
youtube_dlc/extractor/stv.py
youtube_dlc/extractor/twitch.py
youtube_dlc/extractor/twitter.py
youtube_dlc/extractor/xfileshare.py
youtube_dlc/utils.py

index 54911fcc5629efe81b9422bcc22fd6f41b089717..e1c04d319a730b7fe649fe5ed6fff1e6e458a507 100644 (file)
@@ -55,6 +55,7 @@ # Supported sites
  - **Aparat**
  - **AppleConnect**
  - **AppleDaily**: 臺灣蘋果日報
+ - **ApplePodcasts**
  - **appletrailers**
  - **appletrailers:section**
  - **archive.org**: archive.org videos
@@ -99,6 +100,10 @@ # Supported sites
  - **BellMedia**
  - **Bet**
  - **bfi:player**
+ - **bfmtv**
+ - **bfmtv:article**
+ - **bfmtv:live**
+ - **BibelTV**
  - **Bigflix**
  - **Bild**: Bild.de
  - **BiliBili**
@@ -346,6 +351,8 @@ # Supported sites
  - **Go**
  - **GodTube**
  - **Golem**
+ - **google:podcasts**
+ - **google:podcasts:feed**
  - **GoogleDrive**
  - **Goshgay**
  - **GPUTechConf**
@@ -381,6 +388,8 @@ # Supported sites
  - **HungamaSong**
  - **Hypem**
  - **ign.com**
+ - **IHeartRadio**
+ - **iheartradio:podcast**
  - **imdb**: Internet Movie Database trailers
  - **imdb:list**: Internet Movie Database lists
  - **Imgur**
@@ -706,7 +715,6 @@ # Supported sites
  - **Playwire**
  - **pluralsight**
  - **pluralsight:course**
- - **plus.google**: Google Plus
  - **podomatic**
  - **Pokemon**
  - **PokemonWatch**
@@ -1146,7 +1154,7 @@ # Supported sites
  - **WWE**
  - **XBef**
  - **XboxClips**
- - **XFileShare**: XFileShare based sites: ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, XVideoSharing
+ - **XFileShare**: XFileShare based sites: Aparat, ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, XVideoSharing
  - **XHamster**
  - **XHamsterEmbed**
  - **XHamsterUser**
index 86e20cb4be444c4ab24924eb31e3fa7a266041fd..0014d57b69f00552cf005ee509205aba6e853d11 100644 (file)
@@ -264,16 +264,24 @@ def test_allsubtitles(self):
 
 
 class TestRaiPlaySubtitles(BaseTestSubtitles):
-    url = 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html'
     IE = RaiPlayIE
 
-    def test_allsubtitles(self):
+    def test_subtitles_key(self):
+        self.url = 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html'
         self.DL.params['writesubtitles'] = True
         self.DL.params['allsubtitles'] = True
         subtitles = self.getSubtitles()
         self.assertEqual(set(subtitles.keys()), set(['it']))
         self.assertEqual(md5(subtitles['it']), 'b1d90a98755126b61e667567a1f6680a')
 
+    def test_subtitles_array_key(self):
+        self.url = 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html'
+        self.DL.params['writesubtitles'] = True
+        self.DL.params['allsubtitles'] = True
+        subtitles = self.getSubtitles()
+        self.assertEqual(set(subtitles.keys()), set(['it']))
+        self.assertEqual(md5(subtitles['it']), '4b3264186fbb103508abe5311cfcb9cd')
+
 
 class TestVikiSubtitles(BaseTestSubtitles):
     url = 'http://www.viki.com/videos/1060846v-punch-episode-18'
index bb69b052204d36e0112947261c286d6cea6f7fc5..a0f78ebe1430deeb8d6e12be39cc2d359a571e22 100644 (file)
@@ -21,6 +21,7 @@
     encode_base_n,
     caesar,
     clean_html,
+    clean_podcast_url,
     date_from_str,
     DateRange,
     detect_exe_version,
@@ -1497,6 +1498,10 @@ def test_iri_to_uri(self):
             iri_to_uri('http://导航.中国/'),
             'http://xn--fet810g.xn--fiqs8s/')
 
+    def test_clean_podcast_url(self):
+        self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3')
+        self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3')
+
 
 if __name__ == '__main__':
     unittest.main()
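
Note: the clean_podcast_url() helper these tests exercise lives in youtube_dlc/utils.py and is not shown in this diff. A minimal sketch that satisfies both assertions above, with an illustrative (not authoritative) prefix list:

import re

def clean_podcast_url(url):
    # Strip stacked measurement/redirect prefixes (podtrac, chartable)
    # until the direct media URL remains, forcing an https scheme.
    # Hypothetical reimplementation; the real prefix list is longer.
    return re.sub(r'''(?x)^https?://(?:
        (?:www\.)?podtrac\.com/pts/redirect\.[0-9a-z]{3,4}/|
        play\.podtrac\.com/[^/]+/|
        chtbl\.com/track/[0-9A-Z]+/
    )+''', 'https://', url)
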
index 5e1ff4f6b2eda06d998851b6dbfe610e232a68be..7aaebc9400e2a9f4962d5c47ccbc85aaaf1ee634 100644 (file)
@@ -172,8 +172,12 @@ def is_ad_fragment_end(s):
                         iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence)
                         decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen(
                             self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read()
-                        frag_content = AES.new(
-                            decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content)
+                        # Don't decrypt the content in tests, since the data is explicitly truncated and not padded to a valid
+                        # block size (see https://github.com/ytdl-org/youtube-dl/pull/27660). Tests only care that the correct
+                        # data was downloaded, not what it decrypts to.
+                        if not test:
+                            frag_content = AES.new(
+                                decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content)
                     self._append_fragment(ctx, frag_content)
                     # We only download the first fragment during the test
                     if test:
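
Note: for context on the new guard, AES-CBC can only decrypt data whose length is a multiple of the 16-byte block size, and the test download deliberately truncates the fragment, so decryption would raise. A standalone demonstration, assuming PyCryptodome provides the Crypto.Cipher interface used here:

from Crypto.Cipher import AES

fragment = b'x' * 10  # truncated test data: not a multiple of the 16-byte block size
cipher = AES.new(b'0' * 16, AES.MODE_CBC, b'1' * 16)
try:
    cipher.decrypt(fragment)
except ValueError as e:
    print('decryption refused:', e)
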
index 60378db1be3eeabb1a2907754588442f22dd0e08..b9355a2c81d2c47dae19b0c6d252b08c3d428b43 100644 (file)
@@ -6,6 +6,7 @@
 from .common import InfoExtractor
 from ..utils import (
     clean_html,
+    clean_podcast_url,
     int_or_none,
     parse_iso8601,
 )
@@ -17,7 +18,7 @@ def _extract_episode(self, episode, show_info):
         info = {
             'id': episode['id'],
             'display_id': episode.get('episodeUrl'),
-            'url': episode['url'],
+            'url': clean_podcast_url(episode['url']),
             'title': title,
             'description': clean_html(episode.get('description') or episode.get('summary')),
             'thumbnail': episode.get('image'),
diff --git a/youtube_dlc/extractor/applepodcasts.py b/youtube_dlc/extractor/applepodcasts.py
new file mode 100644 (file)
index 0000000..95758fe
--- /dev/null
@@ -0,0 +1,61 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_podcast_url,
+    int_or_none,
+    parse_iso8601,
+    try_get,
+)
+
+
+class ApplePodcastsIE(InfoExtractor):
+    _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
+        'md5': 'df02e6acb11c10e844946a39e7222b08',
+        'info_dict': {
+            'id': '1000482637777',
+            'ext': 'mp3',
+            'title': '207 - Whitney Webb Returns',
+            'description': 'md5:13a73bade02d2e43737751e3987e1399',
+            'upload_date': '20200705',
+            'timestamp': 1593921600,
+            'duration': 6425,
+            'series': 'The Tim Dillon Show',
+        }
+    }, {
+        'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
+        'only_matching': True,
+    }, {
+        'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777',
+        'only_matching': True,
+    }, {
+        'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        episode_id = self._match_id(url)
+        webpage = self._download_webpage(url, episode_id)
+        ember_data = self._parse_json(self._search_regex(
+            r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
+            webpage, 'ember data'), episode_id)
+        episode = ember_data['data']['attributes']
+        description = episode.get('description') or {}
+
+        series = None
+        for inc in (ember_data.get('included') or []):
+            if inc.get('type') == 'media/podcast':
+                series = try_get(inc, lambda x: x['attributes']['name'])
+
+        return {
+            'id': episode_id,
+            'title': episode['name'],
+            'url': clean_podcast_url(episode['assetUrl']),
+            'description': description.get('standard') or description.get('short'),
+            'timestamp': parse_iso8601(episode.get('releaseDateTime')),
+            'duration': int_or_none(episode.get('durationInMilliseconds'), 1000),
+            'series': series,
+        }
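
Note: the extraction pivots on the server-rendered "shoebox" JSON blob. Stripped of InfoExtractor plumbing, the step reduces to the following; the HTML snippet is a made-up minimal example, not real Apple markup:

import json
import re

html = ('<script id="shoebox-ember-data-store" type="fastboot/shoebox">'
        '{"data": {"attributes": {"name": "Episode", "assetUrl": "https://example.com/e.mp3"}}}'
        '</script>')
blob = re.search(r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', html).group(1)
episode = json.loads(blob)['data']['attributes']
print(episode['name'], episode['assetUrl'])
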
diff --git a/youtube_dlc/extractor/bfmtv.py b/youtube_dlc/extractor/bfmtv.py
new file mode 100644 (file)
index 0000000..501f69d
--- /dev/null
@@ -0,0 +1,103 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import extract_attributes
+
+
+class BFMTVBaseIE(InfoExtractor):
+    _VALID_URL_BASE = r'https?://(?:www\.)?bfmtv\.com/'
+    _VALID_URL_TMPL = _VALID_URL_BASE + r'(?:[^/]+/)*[^/?&#]+_%s[A-Z]-(?P<id>\d{12})\.html'
+    _VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block"[^>]*>)'
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
+
+    def _brightcove_url_result(self, video_id, video_block):
+        account_id = video_block.get('accountid') or '876450612001'
+        player_id = video_block.get('playerid') or 'I2qBTln4u'
+        return self.url_result(
+            self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id),
+            'BrightcoveNew', video_id)
+
+
+class BFMTVIE(BFMTVBaseIE):
+    IE_NAME = 'bfmtv'
+    _VALID_URL = BFMTVBaseIE._VALID_URL_TMPL % 'V'
+    _TESTS = [{
+        'url': 'https://www.bfmtv.com/politique/emmanuel-macron-l-islam-est-une-religion-qui-vit-une-crise-aujourd-hui-partout-dans-le-monde_VN-202010020146.html',
+        'info_dict': {
+            'id': '6196747868001',
+            'ext': 'mp4',
+            'title': 'Emmanuel Macron: "L\'Islam est une religion qui vit une crise aujourd’hui, partout dans le monde"',
+            'description': 'Le Président s\'exprime sur la question du séparatisme depuis les Mureaux, dans les Yvelines.',
+            'uploader_id': '876450610001',
+            'upload_date': '20201002',
+            'timestamp': 1601629620,
+        },
+    }]
+
+    def _real_extract(self, url):
+        bfmtv_id = self._match_id(url)
+        webpage = self._download_webpage(url, bfmtv_id)
+        video_block = extract_attributes(self._search_regex(
+            self._VIDEO_BLOCK_REGEX, webpage, 'video block'))
+        return self._brightcove_url_result(video_block['videoid'], video_block)
+
+
+class BFMTVLiveIE(BFMTVIE):
+    IE_NAME = 'bfmtv:live'
+    _VALID_URL = BFMTVBaseIE._VALID_URL_BASE + '(?P<id>(?:[^/]+/)?en-direct)'
+    _TESTS = [{
+        'url': 'https://www.bfmtv.com/en-direct/',
+        'info_dict': {
+            'id': '5615950982001',
+            'ext': 'mp4',
+            'title': r're:^le direct BFMTV WEB \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+            'uploader_id': '876450610001',
+            'upload_date': '20171018',
+            'timestamp': 1508329950,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.bfmtv.com/economie/en-direct/',
+        'only_matching': True,
+    }]
+
+
+class BFMTVArticleIE(BFMTVBaseIE):
+    IE_NAME = 'bfmtv:article'
+    _VALID_URL = BFMTVBaseIE._VALID_URL_TMPL % 'A'
+    _TESTS = [{
+        'url': 'https://www.bfmtv.com/sante/covid-19-un-responsable-de-l-institut-pasteur-se-demande-quand-la-france-va-se-reconfiner_AV-202101060198.html',
+        'info_dict': {
+            'id': '202101060198',
+            'title': 'Covid-19: un responsable de l\'Institut Pasteur se demande "quand la France va se reconfiner"',
+            'description': 'md5:947974089c303d3ac6196670ae262843',
+        },
+        'playlist_count': 2,
+    }, {
+        'url': 'https://www.bfmtv.com/international/pour-bolsonaro-le-bresil-est-en-faillite-mais-il-ne-peut-rien-faire_AD-202101060232.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.bfmtv.com/sante/covid-19-oui-le-vaccin-de-pfizer-distribue-en-france-a-bien-ete-teste-sur-des-personnes-agees_AN-202101060275.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        bfmtv_id = self._match_id(url)
+        webpage = self._download_webpage(url, bfmtv_id)
+
+        entries = []
+        for video_block_el in re.findall(self._VIDEO_BLOCK_REGEX, webpage):
+            video_block = extract_attributes(video_block_el)
+            video_id = video_block.get('videoid')
+            if not video_id:
+                continue
+            entries.append(self._brightcove_url_result(video_id, video_block))
+
+        return self.playlist_result(
+            entries, bfmtv_id, self._og_search_title(webpage, fatal=False),
+            self._html_search_meta(['og:description', 'description'], webpage))
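
Note: all three BFMTV extractors delegate to Brightcove. Using the fallback account/player ids from _brightcove_url_result and the video id from the bfmtv test above, the delegated URL resolves to:

BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
print(BRIGHTCOVE_URL_TEMPLATE % ('876450612001', 'I2qBTln4u', '6196747868001'))
# http://players.brightcove.net/876450612001/I2qBTln4u_default/index.html?videoId=6196747868001
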
diff --git a/youtube_dlc/extractor/bibeltv.py b/youtube_dlc/extractor/bibeltv.py
new file mode 100644 (file)
index 0000000..56c2bfe
--- /dev/null
@@ -0,0 +1,30 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class BibelTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?:crn/)?(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.bibeltv.de/mediathek/videos/329703-sprachkurs-in-malaiisch',
+        'md5': '252f908192d611de038b8504b08bf97f',
+        'info_dict': {
+            'id': 'ref:329703',
+            'ext': 'mp4',
+            'title': 'Sprachkurs in Malaiisch',
+            'description': 'md5:3e9f197d29ee164714e67351cf737dfe',
+            'timestamp': 1608316701,
+            'uploader_id': '5840105145001',
+            'upload_date': '20201218',
+        }
+    }, {
+        'url': 'https://www.bibeltv.de/mediathek/videos/crn/326374',
+        'only_matching': True,
+    }]
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5840105145001/default_default/index.html?videoId=ref:%s'
+
+    def _real_extract(self, url):
+        crn_id = self._match_id(url)
+        return self.url_result(
+            self.BRIGHTCOVE_URL_TEMPLATE % crn_id, 'BrightcoveNew')
index 8667a0d0457cccfc145cc52bc1eb1c7816aa04b8..8b76a0200ca51a350052d7bcc2b4beff3fee2448 100644 (file)
@@ -7,12 +7,12 @@
 from .gigya import GigyaBaseIE
 from ..compat import compat_HTTPError
 from ..utils import (
+    extract_attributes,
     ExtractorError,
     strip_or_none,
     float_or_none,
     int_or_none,
     merge_dicts,
-    parse_iso8601,
     str_or_none,
     url_or_none,
 )
@@ -37,6 +37,7 @@ class CanvasIE(InfoExtractor):
         'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
         'only_matching': True,
     }]
+    _GEO_BYPASS = False
     _HLS_ENTRY_PROTOCOLS_MAP = {
         'HLS': 'm3u8_native',
         'HLS_AES': 'm3u8',
@@ -47,29 +48,34 @@ def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         site_id, video_id = mobj.group('site_id'), mobj.group('id')
 
-        # Old API endpoint, serves more formats but may fail for some videos
-        data = self._download_json(
-            'https://mediazone.vrt.be/api/v1/%s/assets/%s'
-            % (site_id, video_id), video_id, 'Downloading asset JSON',
-            'Unable to download asset JSON', fatal=False)
+        data = None
+        if site_id != 'vrtvideo':
+            # Old API endpoint, serves more formats but may fail for some videos
+            data = self._download_json(
+                'https://mediazone.vrt.be/api/v1/%s/assets/%s'
+                % (site_id, video_id), video_id, 'Downloading asset JSON',
+                'Unable to download asset JSON', fatal=False)
 
         # New API endpoint
         if not data:
+            headers = self.geo_verification_headers()
+            headers.update({'Content-Type': 'application/json'})
             token = self._download_json(
                 '%s/tokens' % self._REST_API_BASE, video_id,
-                'Downloading token', data=b'',
-                headers={'Content-Type': 'application/json'})['vrtPlayerToken']
+                'Downloading token', data=b'', headers=headers)['vrtPlayerToken']
             data = self._download_json(
                 '%s/videos/%s' % (self._REST_API_BASE, video_id),
-                video_id, 'Downloading video JSON', fatal=False, query={
+                video_id, 'Downloading video JSON', query={
                     'vrtPlayerToken': token,
                     'client': '%s@PROD' % site_id,
                 }, expected_status=400)
-            message = data.get('message')
-            if message and not data.get('title'):
-                if data.get('code') == 'AUTHENTICATION_REQUIRED':
-                    self.raise_login_required(message)
-                raise ExtractorError(message, expected=True)
+            if not data.get('title'):
+                code = data.get('code')
+                if code == 'AUTHENTICATION_REQUIRED':
+                    self.raise_login_required()
+                elif code == 'INVALID_LOCATION':
+                    self.raise_geo_restricted(countries=['BE'])
+                raise ExtractorError(data.get('message') or code, expected=True)
 
         title = data['title']
         description = data.get('description')
@@ -205,20 +211,24 @@ def _real_extract(self, url):
 
 class VrtNUIE(GigyaBaseIE):
     IE_DESC = 'VrtNU.be'
-    _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?P<site_id>vrtnu)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)'
     _TESTS = [{
         # Available via old API endpoint
-        'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/',
+        'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/',
         'info_dict': {
-            'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de',
+            'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de',
             'ext': 'mp4',
-            'title': 'De zwarte weduwe',
-            'description': 'md5:db1227b0f318c849ba5eab1fef895ee4',
+            'title': 'Postbus X - Aflevering 1 (Seizoen 1989)',
+            'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7',
             'duration': 1457.04,
             'thumbnail': r're:^https?://.*\.jpg$',
-            'season': 'Season 1',
-            'season_number': 1,
+            'series': 'Postbus X',
+            'season': 'Seizoen 1989',
+            'season_number': 1989,
+            'episode': 'De zwarte weduwe',
             'episode_number': 1,
+            'timestamp': 1595822400,
+            'upload_date': '20200727',
         },
         'skip': 'This video is only available for registered users',
         'params': {
@@ -300,69 +310,25 @@ def _login(self):
     def _real_extract(self, url):
         display_id = self._match_id(url)
 
-        webpage, urlh = self._download_webpage_handle(url, display_id)
-
-        info = self._search_json_ld(webpage, display_id, default={})
-
-        # title is optional here since it may be extracted by extractor
-        # that is delegated from here
-        title = strip_or_none(self._html_search_regex(
-            r'(?ms)<h1 class="content__heading">(.+?)</h1>',
-            webpage, 'title', default=None))
-
-        description = self._html_search_regex(
-            r'(?ms)<div class="content__description">(.+?)</div>',
-            webpage, 'description', default=None)
-
-        season = self._html_search_regex(
-            [r'''(?xms)<div\ class="tabs__tab\ tabs__tab--active">\s*
-                    <span>seizoen\ (.+?)</span>\s*
-                </div>''',
-             r'<option value="seizoen (\d{1,3})" data-href="[^"]+?" selected>'],
-            webpage, 'season', default=None)
-
-        season_number = int_or_none(season)
-
-        episode_number = int_or_none(self._html_search_regex(
-            r'''(?xms)<div\ class="content__episode">\s*
-                    <abbr\ title="aflevering">afl</abbr>\s*<span>(\d+)</span>
-                </div>''',
-            webpage, 'episode_number', default=None))
-
-        release_date = parse_iso8601(self._html_search_regex(
-            r'(?ms)<div class="content__broadcastdate">\s*<time\ datetime="(.+?)"',
-            webpage, 'release_date', default=None))
-
-        # If there's a ? or a # in the URL, remove them and everything after
-        clean_url = urlh.geturl().split('?')[0].split('#')[0].strip('/')
-        securevideo_url = clean_url + '.mssecurevideo.json'
-
-        try:
-            video = self._download_json(securevideo_url, display_id)
-        except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
-                self.raise_login_required()
-            raise
+        webpage = self._download_webpage(url, display_id)
 
-        # We are dealing with a '../<show>.relevant' URL
-        redirect_url = video.get('url')
-        if redirect_url:
-            return self.url_result(self._proto_relative_url(redirect_url, 'https:'))
+        attrs = extract_attributes(self._search_regex(
+            r'(<nui-media[^>]+>)', webpage, 'media element'))
+        video_id = attrs['videoid']
+        publication_id = attrs.get('publicationid')
+        if publication_id:
+            video_id = publication_id + '$' + video_id
 
-        # There is only one entry, but with an unknown key, so just get
-        # the first one
-        video_id = list(video.values())[0].get('videoid')
+        page = (self._parse_json(self._search_regex(
+            r'digitalData\s*=\s*({.+?});', webpage, 'digital data',
+            default='{}'), video_id, fatal=False) or {}).get('page') or {}
 
+        info = self._search_json_ld(webpage, display_id, default={})
         return merge_dicts(info, {
             '_type': 'url_transparent',
             'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id,
             'ie_key': CanvasIE.ie_key(),
             'id': video_id,
             'display_id': display_id,
-            'title': title,
-            'description': description,
-            'season': season,
-            'season_number': season_number,
-            'episode_number': episode_number,
-            'release_date': release_date,
+            'season_number': int_or_none(page.get('episode_season')),
         })
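
Note: the rewritten VrtNUIE now reads the ids straight from the <nui-media> element. A standalone sketch of that step, with extract_attributes approximated by a simple findall and hypothetical attribute values modelled on the test id above:

import re

webpage = '<nui-media videoid="vid-90c932b1" publicationid="pbs-pub-e8713dac"></nui-media>'
tag = re.search(r'(<nui-media[^>]+>)', webpage).group(1)
attrs = dict(re.findall(r'(\w+)="([^"]*)"', tag))  # crude extract_attributes stand-in
video_id = attrs['videoid']
if attrs.get('publicationid'):
    video_id = attrs['publicationid'] + '$' + video_id
print(video_id)  # pbs-pub-e8713dac$vid-90c932b1
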
index a7b9db5689c4df697f1606a343995ef3d29ee03a..47501dbe6140ea8f30d0ce532bfc3bd321f35557 100644 (file)
 class DPlayIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://
         (?P<domain>
-            (?:www\.)?(?P<host>dplay\.(?P<country>dk|fi|jp|se|no))|
+            (?:www\.)?(?P<host>d
+                (?:
+                    play\.(?P<country>dk|fi|jp|se|no)|
+                    iscoveryplus\.(?P<plus_country>dk|es|fi|it|se|no)
+                )
+            )|
             (?P<subdomain_country>es|it)\.dplay\.com
         )/[^/]+/(?P<id>[^/]+/[^/?#]+)'''
 
@@ -126,6 +131,24 @@ class DPlayIE(InfoExtractor):
     }, {
         'url': 'https://www.dplay.jp/video/gold-rush/24086',
         'only_matching': True,
+    }, {
+        'url': 'https://www.discoveryplus.se/videos/nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.discoveryplus.dk/videoer/ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.discoveryplus.no/videoer/i-kongens-klr/sesong-1-episode-7',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.discoveryplus.it/videos/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.discoveryplus.es/videos/la-fiebre-del-oro/temporada-8-episodio-1',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.discoveryplus.fi/videot/shifting-gears-with-aaron-kaufman/episode-16',
+        'only_matching': True,
     }]
 
     def _get_disco_api_info(self, url, display_id, disco_host, realm, country):
@@ -241,7 +264,7 @@ def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         display_id = mobj.group('id')
         domain = mobj.group('domain').lstrip('www.')
-        country = mobj.group('country') or mobj.group('subdomain_country')
-        host = 'disco-api.' + domain if domain.startswith('dplay.') else 'eu2-prod.disco-api.com'
+        country = mobj.group('country') or mobj.group('subdomain_country') or mobj.group('plus_country')
+        host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com'
         return self._get_disco_api_info(
             url, display_id, host, 'dplay' + country, country)
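
Note on the rewritten host selection: after the www. strip, both dplay.<tld> and discoveryplus.<tld> begin with 'd', so the single first-character test keeps both families on their own disco-api hosts, while the es/it.dplay.com subdomain sites fall through to the shared eu2-prod endpoint:

for domain in ('dplay.se', 'discoveryplus.it', 'es.dplay.com'):
    host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com'
    print(domain, '->', host)
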
index 200cf13953e642d6eee9d0f8ec5d7694a9e7e6ec..65effed8e6ac8172aa62436f39306ae31becc0c8 100644 (file)
@@ -59,6 +59,7 @@
     AppleTrailersIE,
     AppleTrailersSectionIE,
 )
+from .applepodcasts import ApplePodcastsIE
 from .archiveorg import ArchiveOrgIE
 from .arcpublishing import ArcPublishingIE
 from .arkena import ArkenaIE
 from .beatport import BeatportIE
 from .bet import BetIE
 from .bfi import BFIPlayerIE
+from .bfmtv import (
+    BFMTVIE,
+    BFMTVLiveIE,
+    BFMTVArticleIE,
+)
+from .bibeltv import BibelTVIE
 from .bigflix import BigflixIE
 from .bild import BildIE
 from .bilibili import (
 from .godtube import GodTubeIE
 from .golem import GolemIE
 from .googledrive import GoogleDriveIE
-from .googleplus import GooglePlusIE
+from .googlepodcasts import (
+    GooglePodcastsIE,
+    GooglePodcastsFeedIE,
+)
 from .googlesearch import GoogleSearchIE
 from .goshgay import GoshgayIE
 from .gputechconf import GPUTechConfIE
     OneUPIE,
     PCMagIE,
 )
+from .iheart import (
+    IHeartRadioIE,
+    IHeartRadioPodcastIE,
+)
 from .imdb import (
     ImdbIE,
     ImdbListIE
diff --git a/youtube_dlc/extractor/googleplus.py b/youtube_dlc/extractor/googleplus.py
deleted file mode 100644 (file)
index 6b927bb..0000000
+++ /dev/null
@@ -1,73 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-import codecs
-
-from .common import InfoExtractor
-from ..utils import unified_strdate
-
-
-class GooglePlusIE(InfoExtractor):
-    IE_DESC = 'Google Plus'
-    _VALID_URL = r'https?://plus\.google\.com/(?:[^/]+/)*?posts/(?P<id>\w+)'
-    IE_NAME = 'plus.google'
-    _TEST = {
-        'url': 'https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH',
-        'info_dict': {
-            'id': 'ZButuJc6CtH',
-            'ext': 'flv',
-            'title': '嘆きの天使 降臨',
-            'upload_date': '20120613',
-            'uploader': '井上ヨシマサ',
-        }
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        # Step 1, Retrieve post webpage to extract further information
-        webpage = self._download_webpage(url, video_id, 'Downloading entry webpage')
-
-        title = self._og_search_description(webpage).splitlines()[0]
-        upload_date = unified_strdate(self._html_search_regex(
-            r'''(?x)<a.+?class="o-U-s\s[^"]+"\s+style="display:\s*none"\s*>
-                    ([0-9]{4}-[0-9]{2}-[0-9]{2})</a>''',
-            webpage, 'upload date', fatal=False, flags=re.VERBOSE))
-        uploader = self._html_search_regex(
-            r'rel="author".*?>(.*?)</a>', webpage, 'uploader', fatal=False)
-
-        # Step 2, Simulate clicking the image box to launch video
-        DOMAIN = 'https://plus.google.com/'
-        video_page = self._search_regex(
-            r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN),
-            webpage, 'video page URL')
-        if not video_page.startswith(DOMAIN):
-            video_page = DOMAIN + video_page
-
-        webpage = self._download_webpage(video_page, video_id, 'Downloading video page')
-
-        def unicode_escape(s):
-            decoder = codecs.getdecoder('unicode_escape')
-            return re.sub(
-                r'\\u[0-9a-fA-F]{4,}',
-                lambda m: decoder(m.group(0))[0],
-                s)
-
-        # Extract video links all sizes
-        formats = [{
-            'url': unicode_escape(video_url),
-            'ext': 'flv',
-            'width': int(width),
-            'height': int(height),
-        } for width, height, video_url in re.findall(
-            r'\d+,(\d+),(\d+),"(https?://[^.]+\.googleusercontent\.com.*?)"', webpage)]
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'uploader': uploader,
-            'upload_date': upload_date,
-            'formats': formats,
-        }
diff --git a/youtube_dlc/extractor/googlepodcasts.py b/youtube_dlc/extractor/googlepodcasts.py
new file mode 100644 (file)
index 0000000..31ad799
--- /dev/null
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_podcast_url,
+    int_or_none,
+    try_get,
+    urlencode_postdata,
+)
+
+
+class GooglePodcastsBaseIE(InfoExtractor):
+    _VALID_URL_BASE = r'https?://podcasts\.google\.com/feed/'
+
+    def _batch_execute(self, func_id, video_id, params):
+        return json.loads(self._download_json(
+            'https://podcasts.google.com/_/PodcastsUi/data/batchexecute',
+            video_id, data=urlencode_postdata({
+                'f.req': json.dumps([[[func_id, json.dumps(params), None, '1']]]),
+            }), transform_source=lambda x: self._search_regex(r'(?s)(\[.+\])', x, 'data'))[0][2])
+
+    def _extract_episode(self, episode):
+        return {
+            'id': episode[4][3],
+            'title': episode[8],
+            'url': clean_podcast_url(episode[13]),
+            'thumbnail': episode[2],
+            'description': episode[9],
+            'creator': try_get(episode, lambda x: x[14]),
+            'timestamp': int_or_none(episode[11]),
+            'duration': int_or_none(episode[12]),
+            'series': episode[1],
+        }
+
+
+class GooglePodcastsIE(GooglePodcastsBaseIE):
+    IE_NAME = 'google:podcasts'
+    _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<feed_url>[^/]+)/episode/(?P<id>[^/?&#]+)'
+    _TEST = {
+        'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA/episode/MzBlNWRlN2UtOWE4Yy00ODcwLTk2M2MtM2JlMmUyNmViOTRh',
+        'md5': 'fa56b2ee8bd0703e27e42d4b104c4766',
+        'info_dict': {
+            'id': '30e5de7e-9a8c-4870-963c-3be2e26eb94a',
+            'ext': 'mp3',
+            'title': 'WWDTM New Year 2021',
+            'description': 'We say goodbye to 2020 with Christine Baranksi, Doug Jones, Jonna Mendez, and Kellee Edwards.',
+            'upload_date': '20210102',
+            'timestamp': 1609606800,
+            'duration': 2901,
+            'series': "Wait Wait... Don't Tell Me!",
+        }
+    }
+
+    def _real_extract(self, url):
+        b64_feed_url, b64_guid = re.match(self._VALID_URL, url).groups()
+        episode = self._batch_execute(
+            'oNjqVe', b64_guid, [b64_feed_url, b64_guid])[1]
+        return self._extract_episode(episode)
+
+
+class GooglePodcastsFeedIE(GooglePodcastsBaseIE):
+    IE_NAME = 'google:podcasts:feed'
+    _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<id>[^/?&#]+)/?(?:[?#&]|$)'
+    _TEST = {
+        'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA',
+        'info_dict': {
+            'title': "Wait Wait... Don't Tell Me!",
+            'description': "NPR's weekly current events quiz. Have a laugh and test your news knowledge while figuring out what's real and what we've made up.",
+        },
+        'playlist_mincount': 20,
+    }
+
+    def _real_extract(self, url):
+        b64_feed_url = self._match_id(url)
+        data = self._batch_execute('ncqJEe', b64_feed_url, [b64_feed_url])
+
+        entries = []
+        for episode in (try_get(data, lambda x: x[1][0]) or []):
+            entries.append(self._extract_episode(episode))
+
+        feed = try_get(data, lambda x: x[3]) or []
+        return self.playlist_result(
+            entries, playlist_title=try_get(feed, lambda x: x[0]),
+            playlist_description=try_get(feed, lambda x: x[2]))
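
Note for readers unfamiliar with Google's private batchexecute endpoint used above: the POST body is a form-encoded f.req field wrapping a doubly-JSON-encoded RPC envelope, exactly as _batch_execute assembles it. The RPC id and base64url feed parameter below come from the code and test above:

import json
from urllib.parse import urlencode

func_id = 'ncqJEe'  # feed RPC; 'oNjqVe' is the single-episode RPC
params = ['aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA']
body = urlencode({'f.req': json.dumps([[[func_id, json.dumps(params), None, '1']]])})
print(body[:72] + '...')
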
diff --git a/youtube_dlc/extractor/iheart.py b/youtube_dlc/extractor/iheart.py
new file mode 100644 (file)
index 0000000..b54c05e
--- /dev/null
@@ -0,0 +1,97 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    clean_podcast_url,
+    int_or_none,
+    str_or_none,
+)
+
+
+class IHeartRadioBaseIE(InfoExtractor):
+    def _call_api(self, path, video_id, fatal=True, query=None):
+        return self._download_json(
+            'https://api.iheart.com/api/v3/podcast/' + path,
+            video_id, fatal=fatal, query=query)
+
+    def _extract_episode(self, episode):
+        return {
+            'thumbnail': episode.get('imageUrl'),
+            'description': clean_html(episode.get('description')),
+            'timestamp': int_or_none(episode.get('startDate'), 1000),
+            'duration': int_or_none(episode.get('duration')),
+        }
+
+
+class IHeartRadioIE(IHeartRadioBaseIE):
+    IE_NAME = 'iheartradio'
+    _VALID_URL = r'(?:https?://(?:www\.)?iheart\.com/podcast/[^/]+/episode/(?P<display_id>[^/?&#]+)-|iheartradio:)(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://www.iheart.com/podcast/105-behind-the-bastards-29236323/episode/part-one-alexander-lukashenko-the-dictator-70346499/?embed=true',
+        'md5': 'c8609c92c8688dcb69d8541042b8abca',
+        'info_dict': {
+            'id': '70346499',
+            'ext': 'mp3',
+            'title': 'Part One: Alexander Lukashenko: The Dictator of Belarus',
+            'description': 'md5:96cc7297b3a5a9ebae28643801c96fae',
+            'timestamp': 1597741200,
+            'upload_date': '20200818',
+        }
+    }
+
+    def _real_extract(self, url):
+        episode_id = self._match_id(url)
+        episode = self._call_api(
+            'episodes/' + episode_id, episode_id)['episode']
+        info = self._extract_episode(episode)
+        info.update({
+            'id': episode_id,
+            'title': episode['title'],
+            'url': clean_podcast_url(episode['mediaUrl']),
+        })
+        return info
+
+
+class IHeartRadioPodcastIE(IHeartRadioBaseIE):
+    IE_NAME = 'iheartradio:podcast'
+    _VALID_URL = r'https?://(?:www\.)?iheart(?:podcastnetwork)?\.com/podcast/[^/?&#]+-(?P<id>\d+)/?(?:[?#&]|$)'
+    _TESTS = [{
+        'url': 'https://www.iheart.com/podcast/1119-it-could-happen-here-30717896/',
+        'info_dict': {
+            'id': '30717896',
+            'title': 'It Could Happen Here',
+            'description': 'md5:5842117412a967eb0b01f8088eb663e2',
+        },
+        'playlist_mincount': 11,
+    }, {
+        'url': 'https://www.iheartpodcastnetwork.com/podcast/105-stuff-you-should-know-26940277',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        podcast_id = self._match_id(url)
+        path = 'podcasts/' + podcast_id
+        episodes = self._call_api(
+            path + '/episodes', podcast_id, query={'limit': 1000000000})['data']
+
+        entries = []
+        for episode in episodes:
+            episode_id = str_or_none(episode.get('id'))
+            if not episode_id:
+                continue
+            info = self._extract_episode(episode)
+            info.update({
+                '_type': 'url',
+                'id': episode_id,
+                'title': episode.get('title'),
+                'url': 'iheartradio:' + episode_id,
+                'ie_key': IHeartRadioIE.ie_key(),
+            })
+            entries.append(info)
+
+        podcast = self._call_api(path, podcast_id, False) or {}
+
+        return self.playlist_result(
+            entries, podcast_id, podcast.get('title'), podcast.get('description'))
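
Note: the playlist entries above delegate back to IHeartRadioIE through the internal iheartradio:<id> scheme, which the _VALID_URL alternation accepts alongside regular episode URLs. A quick check:

import re

_VALID_URL = r'(?:https?://(?:www\.)?iheart\.com/podcast/[^/]+/episode/(?P<display_id>[^/?&#]+)-|iheartradio:)(?P<id>\d+)'
print(re.match(_VALID_URL, 'iheartradio:70346499').group('id'))  # 70346499
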
index 93a98e1e08beff701594d4e7d763c3b7b5790ab1..e0599d02fabf30c12a939064d063e8e7f4ba83f8 100644 (file)
@@ -2,92 +2,71 @@
 
 from .canvas import CanvasIE
 from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+)
 
 
 class KetnetIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P<id>(?:[^/]+/)*[^/?#&]+)'
     _TESTS = [{
-        'url': 'https://www.ketnet.be/kijken/zomerse-filmpjes',
-        'md5': '6bdeb65998930251bbd1c510750edba9',
+        'url': 'https://www.ketnet.be/kijken/n/nachtwacht/3/nachtwacht-s3a1-de-greystook',
+        'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9',
         'info_dict': {
-            'id': 'zomerse-filmpjes',
+            'id': 'pbs-pub-aef8b526-115e-4006-aa24-e59ff6c6ef6f$vid-ddb815bf-c8e7-467b-8879-6bad7a32cebd',
             'ext': 'mp4',
-            'title': 'Gluur mee op de filmset en op Pennenzakkenrock',
-            'description': 'Gluur mee met Ghost Rockers op de filmset',
+            'title': 'Nachtwacht - Reeks 3: Aflevering 1',
+            'description': 'De Nachtwacht krijgt te maken met een parasiet',
             'thumbnail': r're:^https?://.*\.jpg$',
-        }
-    }, {
-        # mzid in playerConfig instead of sources
-        'url': 'https://www.ketnet.be/kijken/nachtwacht/de-greystook',
-        'md5': '90139b746a0a9bd7bb631283f6e2a64e',
-        'info_dict': {
-            'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
-            'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
-            'ext': 'flv',
-            'title': 'Nachtwacht: De Greystook',
-            'description': 'md5:1db3f5dc4c7109c821261e7512975be7',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 1468.03,
+            'duration': 1468.02,
+            'timestamp': 1609225200,
+            'upload_date': '20201229',
+            'series': 'Nachtwacht',
+            'season': 'Reeks 3',
+            'episode': 'De Greystook',
+            'episode_number': 1,
         },
         'expected_warnings': ['is not a supported codec', 'Unknown MIME type'],
     }, {
-        'url': 'https://www.ketnet.be/kijken/karrewiet/uitzending-8-september-2016',
-        'only_matching': True,
-    }, {
-        'url': 'https://www.ketnet.be/achter-de-schermen/sien-repeteert-voor-stars-for-life',
-        'only_matching': True,
-    }, {
-        # mzsource, geo restricted to Belgium
-        'url': 'https://www.ketnet.be/kijken/nachtwacht/de-bermadoe',
+        'url': 'https://www.ketnet.be/themas/karrewiet/jaaroverzicht-20200/karrewiet-het-jaar-van-black-mamba',
         'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, video_id)
-
-        config = self._parse_json(
-            self._search_regex(
-                r'(?s)playerConfig\s*=\s*({.+?})\s*;', webpage,
-                'player config'),
-            video_id)
-
-        mzid = config.get('mzid')
-        if mzid:
-            return self.url_result(
-                'https://mediazone.vrt.be/api/v1/ketnet/assets/%s' % mzid,
-                CanvasIE.ie_key(), video_id=mzid)
+        display_id = self._match_id(url)
 
-        title = config['title']
+        video = self._download_json(
+            'https://senior-bff.ketnet.be/graphql', display_id, query={
+                'query': '''{
+  video(id: "content/ketnet/nl/%s.model.json") {
+    description
+    episodeNr
+    imageUrl
+    mediaReference
+    programTitle
+    publicationDate
+    seasonTitle
+    subtitleVideodetail
+    titleVideodetail
+  }
+}''' % display_id,
+            })['data']['video']
 
-        formats = []
-        for source_key in ('', 'mz'):
-            source = config.get('%ssource' % source_key)
-            if not isinstance(source, dict):
-                continue
-            for format_id, format_url in source.items():
-                if format_id == 'hls':
-                    formats.extend(self._extract_m3u8_formats(
-                        format_url, video_id, 'mp4',
-                        entry_protocol='m3u8_native', m3u8_id=format_id,
-                        fatal=False))
-                elif format_id == 'hds':
-                    formats.extend(self._extract_f4m_formats(
-                        format_url, video_id, f4m_id=format_id, fatal=False))
-                else:
-                    formats.append({
-                        'url': format_url,
-                        'format_id': format_id,
-                    })
-        self._sort_formats(formats)
+        mz_id = compat_urllib_parse_unquote(video['mediaReference'])
 
         return {
-            'id': video_id,
-            'title': title,
-            'description': config.get('description'),
-            'thumbnail': config.get('image'),
-            'series': config.get('program'),
-            'episode': config.get('episode'),
-            'formats': formats,
+            '_type': 'url_transparent',
+            'id': mz_id,
+            'title': video['titleVideodetail'],
+            'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/' + mz_id,
+            'thumbnail': video.get('imageUrl'),
+            'description': video.get('description'),
+            'timestamp': parse_iso8601(video.get('publicationDate')),
+            'series': video.get('programTitle'),
+            'season': video.get('seasonTitle'),
+            'episode': video.get('subtitleVideodetail'),
+            'episode_number': int_or_none(video.get('episodeNr')),
+            'ie_key': CanvasIE.ie_key(),
         }
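
Note: the GraphQL API apparently returns mediaReference percent-encoded (the '$' joining publication and video ids would arrive as %24), hence the compat_urllib_parse_unquote. Illustrative value assembled from the test id above; the exact wire encoding is an assumption:

from urllib.parse import unquote

media_reference = ('pbs-pub-aef8b526-115e-4006-aa24-e59ff6c6ef6f'
                   '%24vid-ddb815bf-c8e7-467b-8879-6bad7a32cebd')  # assumed wire form
mz_id = unquote(media_reference)
print('https://mediazone.vrt.be/api/v1/ketnet/assets/' + mz_id)
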
index b1615b4d8e4bce8b580942f717477e6ed57ee92e..ef1e081f20e7359139dc209275f29c162311d9dc 100644 (file)
@@ -61,6 +61,23 @@ class MotherlessIE(InfoExtractor):
         # no keywords
         'url': 'http://motherless.com/8B4BBC1',
         'only_matching': True,
+    }, {
+        # see https://motherless.com/videos/recent for recent videos with
+        # uploaded date in "ago" format
+        'url': 'https://motherless.com/3C3E2CF',
+        'info_dict': {
+            'id': '3C3E2CF',
+            'ext': 'mp4',
+            'title': 'a/ Hot Teens',
+            'categories': list,
+            'upload_date': '20210104',
+            'uploader_id': 'yonbiw',
+            'thumbnail': r're:https?://.*\.jpg',
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
     }]
 
     def _real_extract(self, url):
@@ -85,20 +102,28 @@ def _real_extract(self, url):
             or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
         age_limit = self._rta_search(webpage)
         view_count = str_to_int(self._html_search_regex(
-            (r'>(\d+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'),
+            (r'>([\d,.]+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'),
             webpage, 'view count', fatal=False))
         like_count = str_to_int(self._html_search_regex(
-            (r'>(\d+)\s+Favorites<', r'<strong>Favorited</strong>\s+([^<]+)<'),
+            (r'>([\d,.]+)\s+Favorites<',
+             r'<strong>Favorited</strong>\s+([^<]+)<'),
             webpage, 'like count', fatal=False))
 
-        upload_date = self._html_search_regex(
-            (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<',
-             r'<strong>Uploaded</strong>\s+([^<]+)<'), webpage, 'upload date')
-        if 'Ago' in upload_date:
-            days = int(re.search(r'([0-9]+)', upload_date).group(1))
-            upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d')
-        else:
-            upload_date = unified_strdate(upload_date)
+        upload_date = unified_strdate(self._search_regex(
+            r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage,
+            'upload date', default=None))
+        if not upload_date:
+            uploaded_ago = self._search_regex(
+                r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago',
+                default=None)
+            if uploaded_ago:
+                delta = int(uploaded_ago[:-1])
+                _AGO_UNITS = {
+                    'h': 'hours',
+                    'd': 'days',
+                }
+                kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta}
+                upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d')
 
         comment_count = webpage.count('class="media-comment-contents"')
         uploader_id = self._html_search_regex(
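
Note: the relative-date handling added above, reduced to a standalone function (the sample HTML is made up in the style motherless.com uses for recent uploads):

import datetime
import re

def upload_date_from_ago(html):
    # Convert "12h Ago" / "3d Ago" strings to a YYYYMMDD upload date.
    m = re.search(r'>\s*(\d+[hd])\s+[aA]go\b', html)
    if not m:
        return None
    ago = m.group(1)
    kwargs = {{'h': 'hours', 'd': 'days'}[ago[-1]]: int(ago[:-1])}
    return (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d')

print(upload_date_from_ago('<span class="count">3d Ago</span>'))
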
index 69178e1579c5de93e128c4419cc2ff7f95270854..40dee2162f5867ca3bc94ca56508329c7905d0b1 100644 (file)
@@ -223,12 +223,12 @@ def call_playback_api(item, query=None):
         legal_age = try_get(
             data, lambda x: x['legalAge']['body']['rating']['code'], compat_str)
         # https://en.wikipedia.org/wiki/Norwegian_Media_Authority
-        if legal_age == 'A':
-            age_limit = 0
-        elif legal_age.isdigit():
-            age_limit = int_or_none(legal_age)
-        else:
-            age_limit = None
+        age_limit = None
+        if legal_age:
+            if legal_age == 'A':
+                age_limit = 0
+            elif legal_age.isdigit():
+                age_limit = int_or_none(legal_age)
 
         is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series'
 
@@ -298,6 +298,14 @@ class NRKTVIE(InfoExtractor):
             'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce',
             'duration': 2223.44,
             'age_limit': 6,
+            'subtitles': {
+                'nb-nor': [{
+                    'ext': 'vtt',
+                }],
+                'nb-ttv': [{
+                    'ext': 'vtt',
+                }]
+            },
         },
     }, {
         'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
index 5eef7c633a5f8927a65cd387c08bddfa181f7aef..c78580d957b17299961fc0a972dd4f8476839b0a 100644 (file)
@@ -103,22 +103,28 @@ def _extract_relinker_info(self, relinker_url, video_id):
         }.items() if v is not None)
 
     @staticmethod
-    def _extract_subtitles(url, subtitle_url):
+    def _extract_subtitles(url, video_data):
+        STL_EXT = 'stl'
+        SRT_EXT = 'srt'
         subtitles = {}
-        if subtitle_url and isinstance(subtitle_url, compat_str):
-            subtitle_url = urljoin(url, subtitle_url)
-            STL_EXT = '.stl'
-            SRT_EXT = '.srt'
-            subtitles['it'] = [{
-                'ext': 'stl',
-                'url': subtitle_url,
-            }]
-            if subtitle_url.endswith(STL_EXT):
-                srt_url = subtitle_url[:-len(STL_EXT)] + SRT_EXT
-                subtitles['it'].append({
-                    'ext': 'srt',
-                    'url': srt_url,
+        subtitles_array = video_data.get('subtitlesArray') or []
+        for k in ('subtitles', 'subtitlesUrl'):
+            subtitles_array.append({'url': video_data.get(k)})
+        for subtitle in subtitles_array:
+            sub_url = subtitle.get('url')
+            if sub_url and isinstance(sub_url, compat_str):
+                sub_lang = subtitle.get('language') or 'it'
+                sub_url = urljoin(url, sub_url)
+                sub_ext = determine_ext(sub_url, SRT_EXT)
+                subtitles.setdefault(sub_lang, []).append({
+                    'ext': sub_ext,
+                    'url': sub_url,
                 })
+                if STL_EXT == sub_ext:
+                    subtitles[sub_lang].append({
+                        'ext': SRT_EXT,
+                        'url': sub_url[:-len(STL_EXT)] + SRT_EXT,
+                    })
         return subtitles
 
 
@@ -138,6 +144,9 @@ class RaiPlayIE(RaiBaseIE):
             'duration': 6160,
             'series': 'Report',
             'season': '2013/14',
+            'subtitles': {
+                'it': 'count:2',
+            },
         },
         'params': {
             'skip_download': True,
@@ -145,6 +154,10 @@ class RaiPlayIE(RaiBaseIE):
     }, {
         'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?',
         'only_matching': True,
+    }, {
+        # subtitles at 'subtitlesArray' key (see #27698)
+        'url': 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -171,7 +184,7 @@ def _real_extract(self, url):
         if date_published and time_published:
             date_published += ' ' + time_published
 
-        subtitles = self._extract_subtitles(url, video.get('subtitles'))
+        subtitles = self._extract_subtitles(url, video)
 
         program_info = media.get('program_info') or {}
         season = media.get('season')
@@ -325,6 +338,22 @@ class RaiIE(RaiBaseIE):
         'params': {
             'skip_download': True,
         },
+    }, {
+        # ContentItem in iframe (see #12652) and subtitle at 'subtitlesUrl' key
+        'url': 'http://www.presadiretta.rai.it/dl/portali/site/puntata/ContentItem-3ed19d13-26c2-46ff-a551-b10828262f1b.html',
+        'info_dict': {
+            'id': '1ad6dc64-444a-42a4-9bea-e5419ad2f5fd',
+            'ext': 'mp4',
+            'title': 'Partiti acchiappavoti - Presa diretta del 13/09/2015',
+            'description': 'md5:d291b03407ec505f95f27970c0b025f4',
+            'upload_date': '20150913',
+            'subtitles': {
+                'it': 'count:2',
+            },
+        },
+        'params': {
+            'skip_download': True,
+        },
     }, {
         # Direct MMS URL
         'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html',
@@ -365,7 +394,7 @@ def _extract_from_content_id(self, content_id, url):
                     'url': compat_urlparse.urljoin(url, thumbnail_url),
                 })
 
-        subtitles = self._extract_subtitles(url, media.get('subtitlesUrl'))
+        subtitles = self._extract_subtitles(url, media)
 
         info = {
             'id': content_id,
@@ -402,7 +431,8 @@ def _real_extract(self, url):
                 r'''(?x)
                     (?:
                         (?:initEdizione|drawMediaRaiTV)\(|
-                        <(?:[^>]+\bdata-id|var\s+uniquename)=
+                        <(?:[^>]+\bdata-id|var\s+uniquename)=|
+                        <iframe[^>]+\bsrc=
                     )
                     (["\'])
                     (?:(?!\1).)*\bContentItem-(?P<id>%s)
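
Note: the reworked _extract_subtitles above boils down to this rule: every subtitle URL gets an entry under its language, and any .stl URL also gets a sibling .srt entry with the extension swapped. A reduced sketch (determine_ext approximated with rpartition; URLs hypothetical):

from urllib.parse import urljoin

def sub_entries(page_url, sub_url, lang='it'):
    sub_url = urljoin(page_url, sub_url)
    entries = [{'ext': sub_url.rpartition('.')[2], 'url': sub_url}]
    if sub_url.endswith('.stl'):
        entries.append({'ext': 'srt', 'url': sub_url[:-3] + 'srt'})
    return {lang: entries}

print(sub_entries('https://www.raiplay.it/video/x.html', '/sub/x.stl'))
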
index 0e623ff7b7039e541754088d606c33855ca5a01b..f722528cdceebb50fb10ce727ffbd969111d6c3e 100644 (file)
@@ -10,7 +10,7 @@
 
 class SBSIE(InfoExtractor):
     IE_DESC = 'sbs.com.au'
-    _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand|news)/video/(?:single/)?(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=)|news/(?:embeds/)?video/)(?P<id>[0-9]+)'
 
     _TESTS = [{
         # Original URL is handled by the generic IE which finds the iframe:
@@ -18,7 +18,7 @@ class SBSIE(InfoExtractor):
         'url': 'http://www.sbs.com.au/ondemand/video/single/320403011771/?source=drupal&vertical=thefeed',
         'md5': '3150cf278965eeabb5b4cea1c963fe0a',
         'info_dict': {
-            'id': '320403011771',
+            'id': '_rFBPRPO4pMR',
             'ext': 'mp4',
             'title': 'Dingo Conservation (The Feed)',
             'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5',
@@ -34,6 +34,15 @@ class SBSIE(InfoExtractor):
     }, {
         'url': 'http://www.sbs.com.au/news/video/471395907773/The-Feed-July-9',
         'only_matching': True,
+    }, {
+        'url': 'https://www.sbs.com.au/ondemand/?play=1836638787723',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.sbs.com.au/ondemand/program/inside-windsor-castle?play=1283505731842',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.sbs.com.au/news/embeds/video/1840778819866',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
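
Note: a quick check that the widened _VALID_URL captures the video id for each of the new URL shapes (plain ondemand, ?play= query, news embed):

import re

_VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=)|news/(?:embeds/)?video/)(?P<id>[0-9]+)'
for u in (
    'http://www.sbs.com.au/ondemand/video/single/320403011771/',
    'https://www.sbs.com.au/ondemand/?play=1836638787723',
    'https://www.sbs.com.au/news/embeds/video/1840778819866',
):
    print(re.match(_VALID_URL, u).group('id'))
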
index bae8b71f4ba8247696f81cec2315a4beb99cd891..539220a944be5d65c3b15fcb1538eb411f5ea5a1 100644 (file)
@@ -8,13 +8,17 @@
     compat_str,
     float_or_none,
     int_or_none,
+    smuggle_url,
+    str_or_none,
+    try_get,
 )
 
 
 class STVPlayerIE(InfoExtractor):
     IE_NAME = 'stv:player'
     _VALID_URL = r'https?://player\.stv\.tv/(?P<type>episode|video)/(?P<id>[a-z0-9]{4})'
-    _TEST = {
+    _TESTS = [{
+        # shortform
         'url': 'https://player.stv.tv/video/4gwd/emmerdale/60-seconds-on-set-with-laura-norton/',
         'md5': '5adf9439c31d554f8be0707c7abe7e0a',
         'info_dict': {
@@ -27,7 +31,11 @@ class STVPlayerIE(InfoExtractor):
             'uploader_id': '1486976045',
         },
         'skip': 'this resource is unavailable outside of the UK',
-    }
+    }, {
+        # episodes
+        'url': 'https://player.stv.tv/episode/4125/jennifer-saunders-memory-lane',
+        'only_matching': True,
+    }]
     BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1486976045/default_default/index.html?videoId=%s'
     _PTYPE_MAP = {
         'episode': 'episodes',
@@ -36,11 +44,31 @@ class STVPlayerIE(InfoExtractor):
 
     def _real_extract(self, url):
         ptype, video_id = re.match(self._VALID_URL, url).groups()
-        resp = self._download_json(
-            'https://player.api.stv.tv/v1/%s/%s' % (self._PTYPE_MAP[ptype], video_id),
-            video_id)
 
-        result = resp['results']
+        webpage = self._download_webpage(url, video_id, fatal=False) or ''
+        props = (self._parse_json(self._search_regex(
+            r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>',
+            webpage, 'next data', default='{}'), video_id,
+            fatal=False) or {}).get('props') or {}
+        player_api_cache = try_get(
+            props, lambda x: x['initialReduxState']['playerApiCache']) or {}
+
+        api_path, resp = None, {}
+        for k, v in player_api_cache.items():
+            if k.startswith('/episodes/') or k.startswith('/shortform/'):
+                api_path, resp = k, v
+                break
+        else:
+            episode_id = str_or_none(try_get(
+                props, lambda x: x['pageProps']['episodeId']))
+            api_path = '/%s/%s' % (self._PTYPE_MAP[ptype], episode_id or video_id)
+
+        result = resp.get('results')
+        if not result:
+            resp = self._download_json(
+                'https://player.api.stv.tv/v1' + api_path, video_id)
+            result = resp['results']
+
         video = result['video']
         video_id = compat_str(video['id'])
 
@@ -57,7 +85,7 @@ def _real_extract(self, url):
         return {
             '_type': 'url_transparent',
             'id': video_id,
-            'url': self.BRIGHTCOVE_URL_TEMPLATE % video_id,
+            'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['GB']}),
             'description': result.get('summary'),
             'duration': float_or_none(video.get('length'), 1000),
             'subtitles': subtitles,
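
The rewritten stv.py extractor scrapes the page's Next.js state before hitting
the player API: the __NEXT_DATA__ script tag carries a playerApiCache whose
/episodes/... or /shortform/... entry may already hold the API response, so the
JSON request above becomes a fallback. A minimal sketch of the same technique
against a toy page (the helper name and sample JSON are made up for illustration):

import json
import re

def parse_next_data(webpage):
    # Pull the JSON blob that Next.js embeds in <script id="__NEXT_DATA__">,
    # the same tag the extractor mines above (hypothetical helper)
    mobj = re.search(
        r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>',
        webpage, re.DOTALL)
    return json.loads(mobj.group(1)) if mobj else {}

page = ('<script id="__NEXT_DATA__" type="application/json">'
        '{"props": {"initialReduxState": {"playerApiCache":'
        ' {"/episodes/4125": {"results": {}}}}}}</script>')
props = parse_next_data(page).get('props') or {}
cache = (props.get('initialReduxState') or {}).get('playerApiCache') or {}
print(list(cache))  # ['/episodes/4125']
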
index ab131a07d1d4d1cbda7cf4a1c7ed7c61f58dd9a5..503d019de185c4f4dcd2f24f75c6467a6f7baa48 100644 (youtube_dlc/extractor/twitch.py)
@@ -9,7 +9,6 @@
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_kwargs,
     compat_parse_qs,
     compat_str,
     compat_urlparse,
@@ -42,30 +41,16 @@ class TwitchBaseIE(InfoExtractor):
     _CLIENT_ID = 'kimne78kx3ncx6brgo4mv6wki5h1ko'
     _NETRC_MACHINE = 'twitch'
 
-    def _handle_error(self, response):
-        if not isinstance(response, dict):
-            return
-        error = response.get('error')
-        if error:
-            raise ExtractorError(
-                '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')),
-                expected=True)
-
-    def _call_api(self, path, item_id, *args, **kwargs):
-        headers = kwargs.get('headers', {}).copy()
-        headers.update({
-            'Accept': 'application/vnd.twitchtv.v5+json; charset=UTF-8',
-            'Client-ID': self._CLIENT_ID,
-        })
-        kwargs.update({
-            'headers': headers,
-            'expected_status': (400, 410),
-        })
-        response = self._download_json(
-            '%s/%s' % (self._API_BASE, path), item_id,
-            *args, **compat_kwargs(kwargs))
-        self._handle_error(response)
-        return response
+    _OPERATION_HASHES = {
+        'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14',
+        'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb',
+        'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777',
+        'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84',
+        'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e',
+        'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01',
+        'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c',
+        'VideoMetadata': '226edb3e692509f727fd56821f5653c05740242c82b0388883e0c0e75dcbf687',
+    }
 
     def _real_initialize(self):
         self._login()
@@ -151,13 +136,46 @@ def _prefer_source(self, formats):
                     })
         self._sort_formats(formats)
 
-    def _download_access_token(self, channel_name):
-        return self._call_api(
-            'api/channels/%s/access_token' % channel_name, channel_name,
-            'Downloading access token JSON')
+    def _download_base_gql(self, video_id, ops, note, fatal=True):
+        return self._download_json(
+            'https://gql.twitch.tv/gql', video_id, note,
+            data=json.dumps(ops).encode(),
+            headers={
+                'Content-Type': 'text/plain;charset=UTF-8',
+                'Client-ID': self._CLIENT_ID,
+            }, fatal=fatal)
 
-    def _extract_channel_id(self, token, channel_name):
-        return compat_str(self._parse_json(token, channel_name)['channel_id'])
+    def _download_gql(self, video_id, ops, note, fatal=True):
+        for op in ops:
+            op['extensions'] = {
+                'persistedQuery': {
+                    'version': 1,
+                    'sha256Hash': self._OPERATION_HASHES[op['operationName']],
+                }
+            }
+        return self._download_base_gql(video_id, ops, note, fatal=fatal)
+
+    def _download_access_token(self, video_id, token_kind, param_name):
+        method = '%sPlaybackAccessToken' % token_kind
+        ops = {
+            'query': '''{
+              %s(
+                %s: "%s",
+                params: {
+                  platform: "web",
+                  playerBackend: "mediaplayer",
+                  playerType: "site"
+                }
+              )
+              {
+                value
+                signature
+              }
+            }''' % (method, param_name, video_id),
+        }
+        return self._download_base_gql(
+            video_id, ops,
+            'Downloading %s access token GraphQL' % token_kind)['data'][method]
 
 
 class TwitchVodIE(TwitchBaseIE):
@@ -170,8 +188,6 @@ class TwitchVodIE(TwitchBaseIE):
                         )
                         (?P<id>\d+)
                     '''
-    _ITEM_TYPE = 'vod'
-    _ITEM_SHORTCUT = 'v'
 
     _TESTS = [{
         'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s',
@@ -181,7 +197,7 @@ class TwitchVodIE(TwitchBaseIE):
             'title': 'LCK Summer Split - Week 6 Day 1',
             'thumbnail': r're:^https?://.*\.jpg$',
             'duration': 17208,
-            'timestamp': 1435131709,
+            'timestamp': 1435131734,
             'upload_date': '20150624',
             'uploader': 'Riot Games',
             'uploader_id': 'riotgames',
@@ -230,10 +246,20 @@ class TwitchVodIE(TwitchBaseIE):
     }]
 
     def _download_info(self, item_id):
-        return self._extract_info(
-            self._call_api(
-                'kraken/videos/%s' % item_id, item_id,
-                'Downloading video info JSON'))
+        data = self._download_gql(
+            item_id, [{
+                'operationName': 'VideoMetadata',
+                'variables': {
+                    'channelLogin': '',
+                    'videoID': item_id,
+                },
+            }],
+            'Downloading video metadata GraphQL')[0]['data']
+        video = data.get('video')
+        if video is None:
+            raise ExtractorError(
+                'Video %s does not exist' % item_id, expected=True)
+        return self._extract_info_gql(video, item_id)
 
     @staticmethod
     def _extract_info(info):
@@ -272,13 +298,33 @@ def _extract_info(info):
             'is_live': is_live,
         }
 
+    @staticmethod
+    def _extract_info_gql(info, item_id):
+        vod_id = info.get('id') or item_id
+        # id backward compatibility for download archives
+        if vod_id[0] != 'v':
+            vod_id = 'v%s' % vod_id
+        thumbnail = url_or_none(info.get('previewThumbnailURL'))
+        if thumbnail:
+            for p in ('width', 'height'):
+                thumbnail = thumbnail.replace('{%s}' % p, '0')
+        return {
+            'id': vod_id,
+            'title': info.get('title') or 'Untitled Broadcast',
+            'description': info.get('description'),
+            'duration': int_or_none(info.get('lengthSeconds')),
+            'thumbnail': thumbnail,
+            'uploader': try_get(info, lambda x: x['owner']['displayName'], compat_str),
+            'uploader_id': try_get(info, lambda x: x['owner']['login'], compat_str),
+            'timestamp': unified_timestamp(info.get('publishedAt')),
+            'view_count': int_or_none(info.get('viewCount')),
+        }
+
     def _real_extract(self, url):
         vod_id = self._match_id(url)
 
         info = self._download_info(vod_id)
-        access_token = self._call_api(
-            'api/vods/%s/access_token' % vod_id, vod_id,
-            'Downloading %s access token' % self._ITEM_TYPE)
+        access_token = self._download_access_token(vod_id, 'video', 'id')
 
         formats = self._extract_m3u8_formats(
             '%s/vod/%s.m3u8?%s' % (
@@ -289,8 +335,8 @@ def _real_extract(self, url):
                     'allow_spectre': 'true',
                     'player': 'twitchweb',
                     'playlist_include_framerate': 'true',
-                    'nauth': access_token['token'],
-                    'nauthsig': access_token['sig'],
+                    'nauth': access_token['value'],
+                    'nauthsig': access_token['signature'],
                 })),
             vod_id, 'mp4', entry_protocol='m3u8_native')
 
@@ -333,37 +379,7 @@ def _make_video_result(node):
     }
 
 
-class TwitchGraphQLBaseIE(TwitchBaseIE):
-    _PAGE_LIMIT = 100
-
-    _OPERATION_HASHES = {
-        'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14',
-        'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb',
-        'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777',
-        'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84',
-        'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e',
-        'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01',
-        'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c',
-    }
-
-    def _download_gql(self, video_id, ops, note, fatal=True):
-        for op in ops:
-            op['extensions'] = {
-                'persistedQuery': {
-                    'version': 1,
-                    'sha256Hash': self._OPERATION_HASHES[op['operationName']],
-                }
-            }
-        return self._download_json(
-            'https://gql.twitch.tv/gql', video_id, note,
-            data=json.dumps(ops).encode(),
-            headers={
-                'Content-Type': 'text/plain;charset=UTF-8',
-                'Client-ID': self._CLIENT_ID,
-            }, fatal=fatal)
-
-
-class TwitchCollectionIE(TwitchGraphQLBaseIE):
+class TwitchCollectionIE(TwitchBaseIE):
     _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/collections/(?P<id>[^/]+)'
 
     _TESTS = [{
@@ -400,7 +416,9 @@ def _real_extract(self, url):
             entries, playlist_id=collection_id, playlist_title=title)
 
 
-class TwitchPlaylistBaseIE(TwitchGraphQLBaseIE):
+class TwitchPlaylistBaseIE(TwitchBaseIE):
+    _PAGE_LIMIT = 100
+
     def _entries(self, channel_name, *args):
         cursor = None
         variables_common = self._make_variables(channel_name, *args)
@@ -440,49 +458,6 @@ def _entries(self, channel_name, *args):
             if not cursor or not isinstance(cursor, compat_str):
                 break
 
-    # Deprecated kraken v5 API
-    def _entries_kraken(self, channel_name, broadcast_type, sort):
-        access_token = self._download_access_token(channel_name)
-        channel_id = self._extract_channel_id(access_token['token'], channel_name)
-        offset = 0
-        counter_override = None
-        for counter in itertools.count(1):
-            response = self._call_api(
-                'kraken/channels/%s/videos/' % channel_id,
-                channel_id,
-                'Downloading video JSON page %s' % (counter_override or counter),
-                query={
-                    'offset': offset,
-                    'limit': self._PAGE_LIMIT,
-                    'broadcast_type': broadcast_type,
-                    'sort': sort,
-                })
-            videos = response.get('videos')
-            if not isinstance(videos, list):
-                break
-            for video in videos:
-                if not isinstance(video, dict):
-                    continue
-                video_url = url_or_none(video.get('url'))
-                if not video_url:
-                    continue
-                yield {
-                    '_type': 'url_transparent',
-                    'ie_key': TwitchVodIE.ie_key(),
-                    'id': video.get('_id'),
-                    'url': video_url,
-                    'title': video.get('title'),
-                    'description': video.get('description'),
-                    'timestamp': unified_timestamp(video.get('published_at')),
-                    'duration': float_or_none(video.get('length')),
-                    'view_count': int_or_none(video.get('views')),
-                    'language': video.get('language'),
-                }
-            offset += self._PAGE_LIMIT
-            total = int_or_none(response.get('_total'))
-            if total and offset >= total:
-                break
-
 
 class TwitchVideosIE(TwitchPlaylistBaseIE):
     _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/(?P<id>[^/]+)/(?:videos|profile)'
@@ -724,7 +699,7 @@ def _real_extract(self, url):
             playlist_title='%s - Collections' % channel_name)
 
 
-class TwitchStreamIE(TwitchGraphQLBaseIE):
+class TwitchStreamIE(TwitchBaseIE):
     IE_NAME = 'twitch:stream'
     _VALID_URL = r'''(?x)
                     https?://
@@ -814,8 +789,9 @@ def _real_extract(self, url):
         if not stream:
             raise ExtractorError('%s is offline' % channel_name, expected=True)
 
-        access_token = self._download_access_token(channel_name)
-        token = access_token['token']
+        access_token = self._download_access_token(
+            channel_name, 'stream', 'channelName')
+        token = access_token['value']
 
         stream_id = stream.get('id') or channel_name
         query = {
@@ -826,7 +802,7 @@ def _real_extract(self, url):
             'player': 'twitchweb',
             'playlist_include_framerate': 'true',
             'segment_preference': '4',
-            'sig': access_token['sig'].encode('utf-8'),
+            'sig': access_token['signature'].encode('utf-8'),
             'token': token.encode('utf-8'),
         }
         formats = self._extract_m3u8_formats(
@@ -912,8 +888,8 @@ class TwitchClipsIE(TwitchBaseIE):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        clip = self._download_json(
-            'https://gql.twitch.tv/gql', video_id, data=json.dumps({
+        clip = self._download_base_gql(
+            video_id, {
                 'query': '''{
   clip(slug: "%s") {
     broadcaster {
@@ -937,10 +913,7 @@ def _real_extract(self, url):
     }
     viewCount
   }
-}''' % video_id,
-            }).encode(), headers={
-                'Client-ID': self._CLIENT_ID,
-            })['data']['clip']
+}''' % video_id}, 'Downloading clip GraphQL')['data']['clip']
 
         if not clip:
             raise ExtractorError(
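
The twitch.py migration above retires the kraken/v5 API in favour of the GraphQL
gateway, using two request shapes: persisted queries (an operation name plus the
fixed sha256 of its query text, wrapped by _download_gql) and inline queries such
as the playback-access-token one. A sketch of how a persisted-query payload is
assembled; the hash is copied from _OPERATION_HASHES above, and nothing is sent:

import json

def persisted_query(op_name, variables, sha256_hash):
    # Twitch's gateway accepts the sha256 of a known query instead of
    # the query text itself; 'version': 1 mirrors _download_gql above
    return {
        'operationName': op_name,
        'variables': variables,
        'extensions': {
            'persistedQuery': {
                'version': 1,
                'sha256Hash': sha256_hash,
            },
        },
    }

ops = [persisted_query(
    'VideoMetadata',
    {'channelLogin': '', 'videoID': '6528877'},
    '226edb3e692509f727fd56821f5653c05740242c82b0388883e0c0e75dcbf687')]
print(json.dumps(ops)[:80])  # the body POSTed to https://gql.twitch.tv/gql
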
index ca5e040c6a69d1666a18e7bdf2cd1ef1c8b79c45..4602c0984184238ec24b4f5614c692309de5879c 100644 (youtube_dlc/extractor/twitter.py)
@@ -251,10 +251,10 @@ class TwitterIE(TwitterBaseIE):
         'info_dict': {
             'id': '700207533655363584',
             'ext': 'mp4',
-            'title': 'simon vetugo - BEAT PROD: @suhmeduh #Damndaniel',
+            'title': 'simon vertugo - BEAT PROD: @suhmeduh #Damndaniel',
             'description': 'BEAT PROD: @suhmeduh  https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ',
             'thumbnail': r're:^https?://.*\.jpg',
-            'uploader': 'simon vetugo',
+            'uploader': 'simon vertugo',
             'uploader_id': 'simonvertugo',
             'duration': 30.0,
             'timestamp': 1455777459,
@@ -312,6 +312,7 @@ class TwitterIE(TwitterBaseIE):
             'timestamp': 1492000653,
             'upload_date': '20170412',
         },
+        'skip': 'Account suspended',
     }, {
         'url': 'https://twitter.com/i/web/status/910031516746514432',
         'info_dict': {
@@ -380,6 +381,14 @@ class TwitterIE(TwitterBaseIE):
         # promo_video_website card
         'url': 'https://twitter.com/GunB1g/status/1163218564784017422',
         'only_matching': True,
+    }, {
+        # promo_video_convo card
+        'url': 'https://twitter.com/poco_dandy/status/1047395834013384704',
+        'only_matching': True,
+    }, {
+        # appplayer card
+        'url': 'https://twitter.com/poco_dandy/status/1150646424461176832',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -462,7 +471,30 @@ def get_binding_value(k):
                     return try_get(o, lambda x: x[x['type'].lower() + '_value'])
 
                 card_name = card['name'].split(':')[-1]
-                if card_name in ('amplify', 'promo_video_website'):
+                if card_name == 'player':
+                    info.update({
+                        '_type': 'url',
+                        'url': get_binding_value('player_url'),
+                    })
+                elif card_name == 'periscope_broadcast':
+                    info.update({
+                        '_type': 'url',
+                        'url': get_binding_value('url') or get_binding_value('player_url'),
+                        'ie_key': PeriscopeIE.ie_key(),
+                    })
+                elif card_name == 'broadcast':
+                    info.update({
+                        '_type': 'url',
+                        'url': get_binding_value('broadcast_url'),
+                        'ie_key': TwitterBroadcastIE.ie_key(),
+                    })
+                elif card_name == 'summary':
+                    info.update({
+                        '_type': 'url',
+                        'url': get_binding_value('card_url'),
+                    })
+                # amplify, promo_video_website, promo_video_convo, appplayer, ...
+                else:
                     is_amplify = card_name == 'amplify'
                     vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url')
                     content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player'))
@@ -488,25 +520,6 @@ def get_binding_value(k):
                         'duration': int_or_none(get_binding_value(
                             'content_duration_seconds')),
                     })
-                elif card_name == 'player':
-                    info.update({
-                        '_type': 'url',
-                        'url': get_binding_value('player_url'),
-                    })
-                elif card_name == 'periscope_broadcast':
-                    info.update({
-                        '_type': 'url',
-                        'url': get_binding_value('url') or get_binding_value('player_url'),
-                        'ie_key': PeriscopeIE.ie_key(),
-                    })
-                elif card_name == 'broadcast':
-                    info.update({
-                        '_type': 'url',
-                        'url': get_binding_value('broadcast_url'),
-                        'ie_key': TwitterBroadcastIE.ie_key(),
-                    })
-                else:
-                    raise ExtractorError('Unsupported Twitter Card.')
             else:
                 expanded_url = try_get(status, lambda x: x['entities']['urls'][0]['expanded_url'])
                 if not expanded_url:
index 48ef07ed1415aff0e81f9029136d480c76e3c14c..cbd5d1cbbb331cbd14777e70950b9c3c2a41175c 100644 (youtube_dlc/extractor/xfileshare.py)
@@ -45,6 +45,7 @@ def aa_decode(aa_code):
 
 class XFileShareIE(InfoExtractor):
     _SITES = (
+        (r'aparat\.cam', 'Aparat'),
         (r'clipwatching\.com', 'ClipWatching'),
         (r'gounlimited\.to', 'GoUnlimited'),
         (r'govid\.me', 'GoVid'),
@@ -78,6 +79,9 @@ class XFileShareIE(InfoExtractor):
             'title': 'sample',
             'thumbnail': r're:http://.*\.jpg',
         },
+    }, {
+        'url': 'https://aparat.cam/n4d6dh0wvlpr',
+        'only_matching': True,
     }]
 
     @staticmethod
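
Adding a host to XFileShareIE is a one-line change because the (host_regex, name)
pairs in _SITES are folded into a single alternation when _VALID_URL is built.
A sketch of the mechanism; the pattern below approximates, but is not, the
extractor's real _VALID_URL:

import re

_SITES = (
    (r'aparat\.cam', 'Aparat'),
    (r'clipwatching\.com', 'ClipWatching'),
)
# Join the host patterns into one named alternation (approximate form)
_VALID_URL = (r'https?://(?:www\.)?(?P<host>%s)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
              % '|'.join(host for host, _ in _SITES))

mobj = re.match(_VALID_URL, 'https://aparat.cam/n4d6dh0wvlpr')
print(mobj.group('host'), mobj.group('id'))  # aparat.cam n4d6dh0wvlpr
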
index 6a04b710e3bc921943723c64a6344d0ba519e018..586ad4150af40c7d9376635aa50fb016ee183504 100644 (youtube_dlc/utils.py)
@@ -5819,3 +5819,20 @@ def format_field(obj, field, template='%s', ignore=(None, ''), default='', func=
     if func and val not in ignore:
         val = func(val)
     return template % val if val not in ignore else default
+
+
+def clean_podcast_url(url):
+    return re.sub(r'''(?x)
+        (?:
+            (?:
+                chtbl\.com/track|
+                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
+                play\.podtrac\.com
+            )/[^/]+|
+            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
+            flex\.acast\.com|
+            pd(?:
+                cn\.co| # https://podcorn.com/analytics-prefix/
+                st\.fm # https://podsights.com/docs/
+            )/e
+        )/''', '', url)
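
clean_podcast_url strips the measurement-redirect prefixes (Chartable, Blubrry,
Podtrac, Acast, Podcorn, Podsights) that podcast feeds chain in front of the
real media URL. A usage sketch with a fabricated URL; chained prefixes are all
removed in one pass, since re.sub substitutes every non-overlapping match:

print(clean_podcast_url(
    'https://www.podtrac.com/pts/redirect.mp3/'
    'chtbl.com/track/ABC123/traffic.megaphone.fm/episode1.mp3'))
# -> https://traffic.megaphone.fm/episode1.mp3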