jfr.im git - yt-dlp.git/commitdiff
Update to ytdl-2021.01.03
authorpukkandan <redacted>
Fri, 1 Jan 2021 12:26:37 +0000 (17:56 +0530)
committerpukkandan <redacted>
Mon, 4 Jan 2021 18:32:27 +0000 (00:02 +0530)
96 files changed:
README.md
docs/supportedsites.md
test/test_InfoExtractor.py
test/test_all_urls.py
test/test_utils.py
youtube_dlc/YoutubeDL.py
youtube_dlc/downloader/hls.py
youtube_dlc/extractor/acast.py
youtube_dlc/extractor/aenetworks.py
youtube_dlc/extractor/amcnetworks.py
youtube_dlc/extractor/americastestkitchen.py
youtube_dlc/extractor/anvato.py
youtube_dlc/extractor/anvato_token_generator/__init__.py [new file with mode: 0644]
youtube_dlc/extractor/anvato_token_generator/common.py [new file with mode: 0644]
youtube_dlc/extractor/anvato_token_generator/nfl.py [new file with mode: 0644]
youtube_dlc/extractor/aparat.py
youtube_dlc/extractor/arcpublishing.py [new file with mode: 0644]
youtube_dlc/extractor/arkena.py
youtube_dlc/extractor/asiancrush.py
youtube_dlc/extractor/bbc.py
youtube_dlc/extractor/beampro.py [deleted file]
youtube_dlc/extractor/bongacams.py [new file with mode: 0644]
youtube_dlc/extractor/brightcove.py
youtube_dlc/extractor/cbslocal.py
youtube_dlc/extractor/cnn.py
youtube_dlc/extractor/common.py
youtube_dlc/extractor/cspan.py
youtube_dlc/extractor/ctv.py [new file with mode: 0644]
youtube_dlc/extractor/drtv.py
youtube_dlc/extractor/eporner.py
youtube_dlc/extractor/extractors.py
youtube_dlc/extractor/facebook.py
youtube_dlc/extractor/fujitv.py [new file with mode: 0644]
youtube_dlc/extractor/gamespot.py
youtube_dlc/extractor/generic.py
youtube_dlc/extractor/go.py
youtube_dlc/extractor/instagram.py
youtube_dlc/extractor/itv.py
youtube_dlc/extractor/lbry.py
youtube_dlc/extractor/linuxacademy.py
youtube_dlc/extractor/mdr.py
youtube_dlc/extractor/mediaset.py
youtube_dlc/extractor/mitele.py
youtube_dlc/extractor/nba.py
youtube_dlc/extractor/nbc.py
youtube_dlc/extractor/nfl.py
youtube_dlc/extractor/nhk.py
youtube_dlc/extractor/niconico.py
youtube_dlc/extractor/ninecninemedia.py
youtube_dlc/extractor/nrk.py
youtube_dlc/extractor/peertube.py
youtube_dlc/extractor/piksel.py
youtube_dlc/extractor/pornhub.py
youtube_dlc/extractor/reddit.py
youtube_dlc/extractor/ruutu.py
youtube_dlc/extractor/sevenplus.py
youtube_dlc/extractor/sky.py
youtube_dlc/extractor/slideslive.py
youtube_dlc/extractor/smotri.py [deleted file]
youtube_dlc/extractor/sonyliv.py
youtube_dlc/extractor/spankbang.py
youtube_dlc/extractor/sprout.py
youtube_dlc/extractor/stitcher.py
youtube_dlc/extractor/streetvoice.py
youtube_dlc/extractor/teachable.py
youtube_dlc/extractor/telecinco.py
youtube_dlc/extractor/telequebec.py
youtube_dlc/extractor/tenplay.py
youtube_dlc/extractor/theplatform.py
youtube_dlc/extractor/theweatherchannel.py
youtube_dlc/extractor/toggle.py
youtube_dlc/extractor/tubitv.py
youtube_dlc/extractor/turner.py
youtube_dlc/extractor/tv5unis.py [new file with mode: 0644]
youtube_dlc/extractor/tva.py
youtube_dlc/extractor/tver.py [new file with mode: 0644]
youtube_dlc/extractor/tvplay.py
youtube_dlc/extractor/twitcasting.py
youtube_dlc/extractor/uktvplay.py
youtube_dlc/extractor/videa.py
youtube_dlc/extractor/videomore.py
youtube_dlc/extractor/viki.py
youtube_dlc/extractor/vimeo.py
youtube_dlc/extractor/vlive.py
youtube_dlc/extractor/vvvvid.py
youtube_dlc/extractor/washingtonpost.py
youtube_dlc/extractor/wdr.py
youtube_dlc/extractor/wistia.py
youtube_dlc/extractor/yandexdisk.py
youtube_dlc/extractor/yandexmusic.py
youtube_dlc/extractor/yandexvideo.py
youtube_dlc/extractor/youtube.py
youtube_dlc/extractor/zdf.py
youtube_dlc/extractor/zype.py
youtube_dlc/options.py
youtube_dlc/utils.py

index ab1c0547b8ca7d393f4ca7454f2f24bbf370864c..681157f6d6efbfb0d86ea2b788cd470b6d17a05a 100644 (file)
--- a/README.md
+++ b/README.md
@@ -493,7 +493,7 @@ ## Authentication Options:
                                      out, youtube-dlc will ask interactively.
     -2, --twofactor TWOFACTOR        Two-factor authentication code
     -n, --netrc                      Use .netrc authentication data
-    --video-password PASSWORD        Video password (vimeo, smotri, youku)
+    --video-password PASSWORD        Video password (vimeo, youku)
 
 ## Adobe Pass Options:
     --ap-mso MSO                     Adobe Pass multiple-system operator (TV
@@ -846,6 +846,7 @@ ## Filtering Formats
  - `container`: Name of the container format
  - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `http_dash_segments`, `m3u8`, or `m3u8_native`)
  - `format_id`: A short description of the format
+ - `language`: Language code
 
 Any string comparison may be prefixed with negation `!` in order to produce an opposite comparison, e.g. `!*=` (does not contain).
 
index 0b183b272ad9d214d3da809626b2246a6a493124..8aede26a911703a29ffc31f8ff4f3fd104ae24ee 100644 (file)
@@ -34,6 +34,8 @@ # Supported sites
  - **adobetv:video**
  - **AdultSwim**
  - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault
+ - **aenetworks:collection**
+ - **aenetworks:show**
  - **afreecatv**: afreecatv.com
  - **AirMozilla**
  - **AliExpressLive**
@@ -55,6 +57,7 @@ # Supported sites
  - **appletrailers**
  - **appletrailers:section**
  - **archive.org**: archive.org videos
+ - **ArcPublishing**
  - **ARD**
  - **ARD:mediathek**
  - **ARDBetaMediathek**
@@ -101,6 +104,7 @@ # Supported sites
  - **BilibiliAudioAlbum**
  - **BiliBiliPlayer**
  - **BioBioChileTV**
+ - **Biography**
  - **BIQLE**
  - **BitChute**
  - **BitChuteChannel**
@@ -110,6 +114,7 @@ # Supported sites
  - **blinkx**
  - **Bloomberg**
  - **BokeCC**
+ - **BongaCams**
  - **BostonGlobe**
  - **Box**
  - **Bpb**: Bundeszentrale für politische Bildung
@@ -144,6 +149,7 @@ # Supported sites
  - **CBS**
  - **CBSInteractive**
  - **CBSLocal**
+ - **CBSLocalArticle**
  - **cbsnews**: CBS News
  - **cbsnews:embed**
  - **cbsnews:livevideo**: CBS News Live Videos
@@ -193,9 +199,9 @@ # Supported sites
  - **CrooksAndLiars**
  - **crunchyroll**
  - **crunchyroll:playlist**
- - **CSNNE**
  - **CSpan**: C-SPAN
  - **CtsNews**: 華視新聞
+ - **CTV**
  - **CTVNews**
  - **cu.ntv.co.jp**: Nippon Television Network
  - **Culturebox**
@@ -271,7 +277,6 @@ # Supported sites
  - **ESPNArticle**
  - **EsriVideo**
  - **Europa**
- - **EveryonesMixtape**
  - **EWETV**
  - **ExpoTV**
  - **Expressen**
@@ -313,11 +318,11 @@ # Supported sites
  - **FrontendMasters**
  - **FrontendMastersCourse**
  - **FrontendMastersLesson**
+ - **FujiTVFODPlus7**
  - **Funimation**
  - **Funk**
  - **Fusion**
  - **Fux**
- - **FXNetworks**
  - **Gaia**
  - **GameInformer**
  - **GameSpot**
@@ -350,6 +355,7 @@ # Supported sites
  - **hgtv.com:show**
  - **HiDive**
  - **HistoricFilms**
+ - **history:player**
  - **history:topic**: History.com Topic
  - **hitbox**
  - **hitbox:live**
@@ -403,7 +409,6 @@ # Supported sites
  - **JWPlatform**
  - **Kakao**
  - **Kaltura**
- - **KanalPlay**: Kanal 5/9/11 Play
  - **Kankan**
  - **Karaoketv**
  - **KarriereVideos**
@@ -427,7 +432,8 @@ # Supported sites
  - **la7.it**
  - **laola1tv**
  - **laola1tv:embed**
- - **lbry.tv**
+ - **lbry**
+ - **lbry:channel**
  - **LCI**
  - **Lcp**
  - **LcpPlay**
@@ -493,6 +499,7 @@ # Supported sites
  - **META**
  - **metacafe**
  - **Metacritic**
+ - **mewatch**
  - **Mgoon**
  - **MGTV**: 芒果TV
  - **MiaoPai**
@@ -503,8 +510,6 @@ # Supported sites
  - **mixcloud**
  - **mixcloud:playlist**
  - **mixcloud:user**
- - **Mixer:live**
- - **Mixer:vod**
  - **MLB**
  - **Mnet**
  - **MNetTV**
@@ -547,6 +552,11 @@ # Supported sites
  - **Naver**
  - **Naver:live**
  - **NBA**
+ - **nba:watch**
+ - **nba:watch:collection**
+ - **NBAChannel**
+ - **NBAEmbed**
+ - **NBAWatchEmbed**
  - **NBC**
  - **NBCNews**
  - **nbcolympics**
@@ -576,8 +586,10 @@ # Supported sites
  - **NextTV**: 壹電視
  - **Nexx**
  - **NexxEmbed**
- - **nfl.com**
+ - **nfl.com** (Currently broken)
+ - **nfl.com:article** (Currently broken)
  - **NhkVod**
+ - **NhkVodProgram**
  - **nhl.com**
  - **nick.com**
  - **nick.de**
@@ -592,7 +604,6 @@ # Supported sites
  - **njoy:embed**
  - **NJPWWorld**: 新日本プロレスワールド
  - **NobelPrize**
- - **Noco**
  - **NonkTube**
  - **Noovo**
  - **Normalboots**
@@ -610,6 +621,7 @@ # Supported sites
  - **Npr**
  - **NRK**
  - **NRKPlaylist**
+ - **NRKRadioPodkast**
  - **NRKSkole**: NRK Skole
  - **NRKTV**: NRK TV and NRK Radio
  - **NRKTVDirekte**: NRK TV Direkte and NRK Radio Direkte
@@ -719,6 +731,7 @@ # Supported sites
  - **qqmusic:singer**: QQ音乐 - 歌手
  - **qqmusic:toplist**: QQ音乐 - 排行榜
  - **QuantumTV**
+ - **Qub**
  - **Quickline**
  - **QuicklineLive**
  - **R7**
@@ -811,18 +824,17 @@ # Supported sites
  - **Shared**: shared.sx
  - **ShowRoomLive**
  - **Sina**
+ - **sky.it**
+ - **sky:news**
+ - **sky:sports**
+ - **sky:sports:news**
+ - **skyacademy.it**
  - **SkylineWebcams**
- - **SkyNews**
  - **skynewsarabia:article**
  - **skynewsarabia:video**
- - **SkySports**
  - **Slideshare**
  - **SlidesLive**
  - **Slutload**
- - **smotri**: Smotri.com
- - **smotri:broadcast**: Smotri.com broadcasts
- - **smotri:community**: Smotri.com community videos
- - **smotri:user**: Smotri.com user videos
  - **Snotr**
  - **Sohu**
  - **SonyLIV**
@@ -883,7 +895,6 @@ # Supported sites
  - **Tagesschau**
  - **tagesschau:player**
  - **Tass**
- - **TastyTrade**
  - **TBS**
  - **TDSLifeway**
  - **Teachable**
@@ -906,6 +917,7 @@ # Supported sites
  - **TeleQuebecEmission**
  - **TeleQuebecLive**
  - **TeleQuebecSquat**
+ - **TeleQuebecVideo**
  - **TeleTask**
  - **Telewebion**
  - **TennisTV**
@@ -923,6 +935,7 @@ # Supported sites
  - **ThisAV**
  - **ThisOldHouse**
  - **TikTok**
+ - **TikTokUser** (Currently broken)
  - **tinypic**: tinypic.com videos
  - **TMZ**
  - **TMZArticle**
@@ -955,12 +968,15 @@ # Supported sites
  - **TV2DKBornholmPlay**
  - **TV4**: tv4.se and tv4play.se
  - **TV5MondePlus**: TV5MONDE+
+ - **tv5unis**
+ - **tv5unis:video**
  - **tv8.it**
  - **TVA**
  - **TVANouvelles**
  - **TVANouvellesArticle**
  - **TVC**
  - **TVCArticle**
+ - **TVer**
  - **tvigle**: Интернет-телевидение Tvigle.ru
  - **tvland.com**
  - **TVN24**
@@ -1089,6 +1105,7 @@ # Supported sites
  - **vube**: Vube.com
  - **VuClip**
  - **VVVVID**
+ - **VVVVIDShow**
  - **VyboryMos**
  - **Vzaar**
  - **Wakanim**
@@ -1111,6 +1128,7 @@ # Supported sites
  - **WeiboMobile**
  - **WeiqiTV**: WQTV
  - **Wistia**
+ - **WistiaPlaylist**
  - **wnl**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl
  - **WorldStarHipHop**
  - **WSJ**: Wall Street Journal
@@ -1142,6 +1160,8 @@ # Supported sites
  - **yahoo:japannews**: Yahoo! Japan News
  - **YandexDisk**
  - **yandexmusic:album**: Яндекс.Музыка - Альбом
+ - **yandexmusic:artist:albums**: Яндекс.Музыка - Артист - Альбомы
+ - **yandexmusic:artist:tracks**: Яндекс.Музыка - Артист - Треки
  - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист
  - **yandexmusic:track**: Яндекс.Музыка - Трек
  - **YandexVideo**
@@ -1169,9 +1189,9 @@ # Supported sites
  - **youtube:subscriptions**: YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)
  - **youtube:tab**: YouTube.com tab
  - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)
+ - **YoutubeYtBe**: youtu.be
  - **YoutubeYtUser**: YouTube.com user videos, URL or "ytuser" keyword
  - **Zapiks**
- - **Zaq1**
  - **Zattoo**
  - **ZattooLive**
  - **ZDF-3sat**
index bdd01e41a3e767b0e75dfc23a204ecd7e727d535..22e3d26a78bca566338ba6ab5aeafb753a896a87 100644 (file)
@@ -98,6 +98,55 @@ def test_html_search_meta(self):
         self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True)
         self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True)
 
+    def test_search_json_ld_realworld(self):
+        # https://github.com/ytdl-org/youtube-dl/issues/23306
+        expect_dict(
+            self,
+            self.ie._search_json_ld(r'''<script type="application/ld+json">
+{
+"@context": "http://schema.org/",
+"@type": "VideoObject",
+"name": "1 On 1 With Kleio",
+"url": "https://www.eporner.com/hd-porn/xN49A1cT3eB/1-On-1-With-Kleio/",
+"duration": "PT0H12M23S",
+"thumbnailUrl": ["https://static-eu-cdn.eporner.com/thumbs/static4/7/78/780/780814/9_360.jpg", "https://imggen.eporner.com/780814/1920/1080/9.jpg"],
+"contentUrl": "https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4",
+"embedUrl": "https://www.eporner.com/embed/xN49A1cT3eB/1-On-1-With-Kleio/",
+"image": "https://static-eu-cdn.eporner.com/thumbs/static4/7/78/780/780814/9_360.jpg",
+"width": "1920",
+"height": "1080",
+"encodingFormat": "mp4",
+"bitrate": "6617kbps",
+"isFamilyFriendly": "False",
+"description": "Kleio Valentien",
+"uploadDate": "2015-12-05T21:24:35+01:00",
+"interactionStatistic": {
+"@type": "InteractionCounter",
+"interactionType": { "@type": "http://schema.org/WatchAction" },
+"userInteractionCount": 1120958
+}, "aggregateRating": {
+"@type": "AggregateRating",
+"ratingValue": "88",
+"ratingCount": "630",
+"bestRating": "100",
+"worstRating": "0"
+}, "actor": [{
+"@type": "Person",
+"name": "Kleio Valentien",
+"url": "https://www.eporner.com/pornstar/kleio-valentien/"
+}]}
+</script>''', None),
+            {
+                'title': '1 On 1 With Kleio',
+                'description': 'Kleio Valentien',
+                'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4',
+                'timestamp': 1449347075,
+                'duration': 743.0,
+                'view_count': 1120958,
+                'width': 1920,
+                'height': 1080,
+            })
+
     def test_download_json(self):
         uri = encode_data_uri(b'{"foo": "blah"}', 'application/json')
         self.assertEqual(self.ie._download_json(uri, None), {'foo': 'blah'})
@@ -108,6 +157,18 @@ def test_download_json(self):
         self.assertEqual(self.ie._download_json(uri, None, fatal=False), None)
 
     def test_parse_html5_media_entries(self):
+        # inline video tag
+        expect_dict(
+            self,
+            self.ie._parse_html5_media_entries(
+                'https://127.0.0.1/video.html',
+                r'<html><video src="/vid.mp4" /></html>', None)[0],
+            {
+                'formats': [{
+                    'url': 'https://127.0.0.1/vid.mp4',
+                }],
+            })
+
         # from https://www.r18.com/
         # with kpbs in label
         expect_dict(
index 8dcdc4e588aea5be944d6dfbd767bf8b7a1bec04..130038c0d9a5127d5a742ddb914c22f4784413d6 100644 (file)
@@ -39,7 +39,7 @@ def test_youtube_playlist_matching(self):
         assertTab('https://www.youtube.com/embedded')
         assertTab('https://www.youtube.com/feed')  # Own channel's home page
         assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
-        assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
+        assertTab('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
         assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
         assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')  # 668
         self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M'))
@@ -60,8 +60,8 @@ def test_youtube_channel_matching(self):
         assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec')
         assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')
 
-    def test_youtube_user_matching(self):
-        self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab'])
+    def test_youtube_user_matching(self):
+        self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab'])
 
     def test_youtube_feeds(self):
         self.assertMatch('https://www.youtube.com/feed/library', ['youtube:tab'])
index 6562d443af8ea8b4e9177c7305a4a67b580382bc..bb69b052204d36e0112947261c286d6cea6f7fc5 100644 (file)
@@ -555,6 +555,11 @@ def test_url_or_none(self):
         self.assertEqual(url_or_none('http$://foo.de'), None)
         self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de')
         self.assertEqual(url_or_none('//foo.de'), '//foo.de')
+        self.assertEqual(url_or_none('s3://foo.de'), None)
+        self.assertEqual(url_or_none('rtmpte://foo.de'), 'rtmpte://foo.de')
+        self.assertEqual(url_or_none('mms://foo.de'), 'mms://foo.de')
+        self.assertEqual(url_or_none('rtspu://foo.de'), 'rtspu://foo.de')
+        self.assertEqual(url_or_none('ftps://foo.de'), 'ftps://foo.de')
 
     def test_parse_age_limit(self):
         self.assertEqual(parse_age_limit(None), None)
index 2cc02e46fad4e8f28447e789a6f3dc491e8c7d0a..715eaa7dc49ee8f126627969683d684a24687fde 100644 (file)
@@ -1705,7 +1705,7 @@ def is_wellformed(f):
         if req_format is None:
             req_format = self._default_format_spec(info_dict, download=download)
             if self.params.get('verbose'):
-                self.to_stdout('[debug] Default format spec: %s' % req_format)
+                self._write_string('[debug] Default format spec: %s\n' % req_format)
 
         format_selector = self.build_format_selector(req_format)
 
@@ -1919,7 +1919,7 @@ def dl(name, info, subtitle=False):
             for ph in self._progress_hooks:
                 fd.add_progress_hook(ph)
             if self.params.get('verbose'):
-                self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
+                self.to_screen('[debug] Invoking downloader on %r' % info.get('url'))
             return fd.download(name, info, subtitle)
 
         subtitles_are_requested = any([self.params.get('writesubtitles', False),
@@ -2635,7 +2635,7 @@ def _write_thumbnails(self, info_dict, filename):
             thumb_ext = determine_ext(t['url'], 'jpg')
             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
-            t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
+            t['filename'] = thumb_filename = replace_extension(filename + suffix, thumb_ext, info_dict.get('ext'))
 
             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
index 0f2c06f40414fd94154c69d2aad365880dcb0e20..5e1ff4f6b2eda06d998851b6dbfe610e232a68be 100644 (file)
@@ -42,11 +42,13 @@ def can_download(manifest, info_dict):
             # no segments will definitely be appended to the end of the playlist.
             # r'#EXT-X-PLAYLIST-TYPE:EVENT',  # media segments may be appended to the end of
             #                                 # event media playlists [4]
+            r'#EXT-X-MAP:',  # media initialization [5]
 
             # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4
             # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2
             # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2
             # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5
+            # 5. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.5
         )
         check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES]
         is_aes128_enc = '#EXT-X-KEY:METHOD=AES-128' in manifest
index b17c792d23a824bb3684d250071efdf42135a2b0..60378db1be3eeabb1a2907754588442f22dd0e08 100644 (file)
@@ -2,21 +2,47 @@
 from __future__ import unicode_literals
 
 import re
-import functools
 
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     clean_html,
-    float_or_none,
     int_or_none,
-    try_get,
-    unified_timestamp,
-    OnDemandPagedList,
+    parse_iso8601,
 )
 
 
-class ACastIE(InfoExtractor):
+class ACastBaseIE(InfoExtractor):
+    def _extract_episode(self, episode, show_info):
+        title = episode['title']
+        info = {
+            'id': episode['id'],
+            'display_id': episode.get('episodeUrl'),
+            'url': episode['url'],
+            'title': title,
+            'description': clean_html(episode.get('description') or episode.get('summary')),
+            'thumbnail': episode.get('image'),
+            'timestamp': parse_iso8601(episode.get('publishDate')),
+            'duration': int_or_none(episode.get('duration')),
+            'filesize': int_or_none(episode.get('contentLength')),
+            'season_number': int_or_none(episode.get('season')),
+            'episode': title,
+            'episode_number': int_or_none(episode.get('episode')),
+        }
+        info.update(show_info)
+        return info
+
+    def _extract_show_info(self, show):
+        return {
+            'creator': show.get('author'),
+            'series': show.get('title'),
+        }
+
+    def _call_api(self, path, video_id, query=None):
+        return self._download_json(
+            'https://feeder.acast.com/api/v1/shows/' + path, video_id, query=query)
+
+
+class ACastIE(ACastBaseIE):
     IE_NAME = 'acast'
     _VALID_URL = r'''(?x)
                     https?://
@@ -28,15 +54,15 @@ class ACastIE(InfoExtractor):
                     '''
     _TESTS = [{
         'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna',
-        'md5': '16d936099ec5ca2d5869e3a813ee8dc4',
+        'md5': 'f5598f3ad1e4776fed12ec1407153e4b',
         'info_dict': {
             'id': '2a92b283-1a75-4ad8-8396-499c641de0d9',
             'ext': 'mp3',
             'title': '2. Raggarmordet - Röster ur det förflutna',
-            'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4',
+            'description': 'md5:a992ae67f4d98f1c0141598f7bebbf67',
             'timestamp': 1477346700,
             'upload_date': '20161024',
-            'duration': 2766.602563,
+            'duration': 2766,
             'creator': 'Anton Berg & Martin Johnson',
             'series': 'Spår',
             'episode': '2. Raggarmordet - Röster ur det förflutna',
@@ -45,7 +71,7 @@ class ACastIE(InfoExtractor):
         'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015',
         'only_matching': True,
     }, {
-        'url': 'https://play.acast.com/s/rattegangspodden/s04e09-styckmordet-i-helenelund-del-22',
+        'url': 'https://play.acast.com/s/rattegangspodden/s04e09styckmordetihelenelund-del2-2',
         'only_matching': True,
     }, {
         'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9',
@@ -54,40 +80,14 @@ class ACastIE(InfoExtractor):
 
     def _real_extract(self, url):
         channel, display_id = re.match(self._VALID_URL, url).groups()
-        s = self._download_json(
-            'https://feeder.acast.com/api/v1/shows/%s/episodes/%s' % (channel, display_id),
-            display_id)
-        media_url = s['url']
-        if re.search(r'[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}', display_id):
-            episode_url = s.get('episodeUrl')
-            if episode_url:
-                display_id = episode_url
-            else:
-                channel, display_id = re.match(self._VALID_URL, s['link']).groups()
-        cast_data = self._download_json(
-            'https://play-api.acast.com/splash/%s/%s' % (channel, display_id),
-            display_id)['result']
-        e = cast_data['episode']
-        title = e.get('name') or s['title']
-        return {
-            'id': compat_str(e['id']),
-            'display_id': display_id,
-            'url': media_url,
-            'title': title,
-            'description': e.get('summary') or clean_html(e.get('description') or s.get('description')),
-            'thumbnail': e.get('image'),
-            'timestamp': unified_timestamp(e.get('publishingDate') or s.get('publishDate')),
-            'duration': float_or_none(e.get('duration') or s.get('duration')),
-            'filesize': int_or_none(e.get('contentLength')),
-            'creator': try_get(cast_data, lambda x: x['show']['author'], compat_str),
-            'series': try_get(cast_data, lambda x: x['show']['name'], compat_str),
-            'season_number': int_or_none(e.get('seasonNumber')),
-            'episode': title,
-            'episode_number': int_or_none(e.get('episodeNumber')),
-        }
+        episode = self._call_api(
+            '%s/episodes/%s' % (channel, display_id),
+            display_id, {'showInfo': 'true'})
+        return self._extract_episode(
+            episode, self._extract_show_info(episode.get('show') or {}))
 
 
-class ACastChannelIE(InfoExtractor):
+class ACastChannelIE(ACastBaseIE):
     IE_NAME = 'acast:channel'
     _VALID_URL = r'''(?x)
                     https?://
@@ -102,34 +102,24 @@ class ACastChannelIE(InfoExtractor):
         'info_dict': {
             'id': '4efc5294-5385-4847-98bd-519799ce5786',
             'title': 'Today in Focus',
-            'description': 'md5:9ba5564de5ce897faeb12963f4537a64',
+            'description': 'md5:c09ce28c91002ce4ffce71d6504abaae',
         },
-        'playlist_mincount': 35,
+        'playlist_mincount': 200,
     }, {
         'url': 'http://play.acast.com/s/ft-banking-weekly',
         'only_matching': True,
     }]
-    _API_BASE_URL = 'https://play.acast.com/api/'
-    _PAGE_SIZE = 10
 
     @classmethod
     def suitable(cls, url):
         return False if ACastIE.suitable(url) else super(ACastChannelIE, cls).suitable(url)
 
-    def _fetch_page(self, channel_slug, page):
-        casts = self._download_json(
-            self._API_BASE_URL + 'channels/%s/acasts?page=%s' % (channel_slug, page),
-            channel_slug, note='Download page %d of channel data' % page)
-        for cast in casts:
-            yield self.url_result(
-                'https://play.acast.com/s/%s/%s' % (channel_slug, cast['url']),
-                'ACast', cast['id'])
-
     def _real_extract(self, url):
-        channel_slug = self._match_id(url)
-        channel_data = self._download_json(
-            self._API_BASE_URL + 'channels/%s' % channel_slug, channel_slug)
-        entries = OnDemandPagedList(functools.partial(
-            self._fetch_page, channel_slug), self._PAGE_SIZE)
-        return self.playlist_result(entries, compat_str(
-            channel_data['id']), channel_data['name'], channel_data.get('description'))
+        show_slug = self._match_id(url)
+        show = self._call_api(show_slug, show_slug)
+        show_info = self._extract_show_info(show)
+        entries = []
+        for episode in (show.get('episodes') or []):
+            entries.append(self._extract_episode(episode, show_info))
+        return self.playlist_result(
+            entries, show.get('id'), show.get('title'), show.get('description'))
index 611b948f51062edea4ccd757471782cffb33bddd..8e4963131731d31e8210550792e7667b8143f145 100644 (file)
@@ -5,20 +5,32 @@
 
 from .theplatform import ThePlatformIE
 from ..utils import (
-    extract_attributes,
     ExtractorError,
+    GeoRestrictedError,
     int_or_none,
-    smuggle_url,
     update_url_query,
-)
-from ..compat import (
-    compat_urlparse,
+    urlencode_postdata,
 )
 
 
 class AENetworksBaseIE(ThePlatformIE):
+    _BASE_URL_REGEX = r'''(?x)https?://
+        (?:(?:www|play|watch)\.)?
+        (?P<domain>
+            (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com|
+            fyi\.tv
+        )/'''
     _THEPLATFORM_KEY = 'crazyjava'
     _THEPLATFORM_SECRET = 's3cr3t'
+    _DOMAIN_MAP = {
+        'history.com': ('HISTORY', 'history'),
+        'aetv.com': ('AETV', 'aetv'),
+        'mylifetime.com': ('LIFETIME', 'lifetime'),
+        'lifetimemovieclub.com': ('LIFETIMEMOVIECLUB', 'lmc'),
+        'fyi.tv': ('FYI', 'fyi'),
+        'historyvault.com': (None, 'historyvault'),
+        'biography.com': (None, 'biography'),
+    }
 
     def _extract_aen_smil(self, smil_url, video_id, auth=None):
         query = {'mbr': 'true'}
@@ -31,7 +43,7 @@ def _extract_aen_smil(self, smil_url, video_id, auth=None):
             'assetTypes': 'high_video_s3'
         }, {
             'assetTypes': 'high_video_s3',
-            'switch': 'hls_ingest_fastly'
+            'switch': 'hls_high_fastly',
         }]
         formats = []
         subtitles = {}
@@ -44,6 +56,8 @@ def _extract_aen_smil(self, smil_url, video_id, auth=None):
                 tp_formats, tp_subtitles = self._extract_theplatform_smil(
                     m_url, video_id, 'Downloading %s SMIL data' % (q.get('switch') or q['assetTypes']))
             except ExtractorError as e:
+                if isinstance(e, GeoRestrictedError):
+                    raise
                 last_e = e
                 continue
             formats.extend(tp_formats)
@@ -57,24 +71,45 @@ def _extract_aen_smil(self, smil_url, video_id, auth=None):
             'subtitles': subtitles,
         }
 
+    def _extract_aetn_info(self, domain, filter_key, filter_value, url):
+        requestor_id, brand = self._DOMAIN_MAP[domain]
+        result = self._download_json(
+            'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand,
+            filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0]
+        title = result['title']
+        video_id = result['id']
+        media_url = result['publicUrl']
+        theplatform_metadata = self._download_theplatform_metadata(self._search_regex(
+            r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id)
+        info = self._parse_theplatform_metadata(theplatform_metadata)
+        auth = None
+        if theplatform_metadata.get('AETN$isBehindWall'):
+            resource = self._get_mvpd_resource(
+                requestor_id, theplatform_metadata['title'],
+                theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'),
+                theplatform_metadata['ratings'][0]['rating'])
+            auth = self._extract_mvpd_auth(
+                url, video_id, requestor_id, resource)
+        info.update(self._extract_aen_smil(media_url, video_id, auth))
+        info.update({
+            'title': title,
+            'series': result.get('seriesName'),
+            'season_number': int_or_none(result.get('tvSeasonNumber')),
+            'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')),
+        })
+        return info
+
 
 class AENetworksIE(AENetworksBaseIE):
     IE_NAME = 'aenetworks'
     IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault'
-    _VALID_URL = r'''(?x)
-                    https?://
-                        (?:www\.)?
-                        (?P<domain>
-                            (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com|
-                            fyi\.tv
-                        )/
-                        (?:
-                            shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|
-                            movies/(?P<movie_display_id>[^/]+)(?:/full-movie)?|
-                            specials/(?P<special_display_id>[^/]+)/(?:full-special|preview-)|
-                            collections/[^/]+/(?P<collection_display_id>[^/]+)
-                        )
-                    '''
+    _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'''(?P<id>
+        shows/[^/]+/season-\d+/episode-\d+|
+        (?:
+            (?:movie|special)s/[^/]+|
+            (?:shows/[^/]+/)?videos
+        )/[^/?#&]+
+    )'''
     _TESTS = [{
         'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1',
         'info_dict': {
@@ -91,22 +126,23 @@ class AENetworksIE(AENetworksBaseIE):
             'skip_download': True,
         },
         'add_ie': ['ThePlatform'],
+        'skip': 'This video is only available for users of participating TV providers.',
     }, {
-        'url': 'http://www.history.com/shows/ancient-aliens/season-1',
+        'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1',
         'info_dict': {
-            'id': '71889446852',
+            'id': '600587331957',
+            'ext': 'mp4',
+            'title': 'Inlawful Entry',
+            'description': 'md5:57c12115a2b384d883fe64ca50529e08',
+            'timestamp': 1452634428,
+            'upload_date': '20160112',
+            'uploader': 'AENE-NEW',
         },
-        'playlist_mincount': 5,
-    }, {
-        'url': 'http://www.mylifetime.com/shows/atlanta-plastic',
-        'info_dict': {
-            'id': 'SERIES4317',
-            'title': 'Atlanta Plastic',
+        'params': {
+            # m3u8 download
+            'skip_download': True,
         },
-        'playlist_mincount': 2,
-    }, {
-        'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1',
-        'only_matching': True
+        'add_ie': ['ThePlatform'],
     }, {
         'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8',
         'only_matching': True
@@ -117,78 +153,125 @@ class AENetworksIE(AENetworksBaseIE):
         'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie',
         'only_matching': True
     }, {
-        'url': 'https://www.lifetimemovieclub.com/movies/a-killer-among-us',
+        'url': 'https://watch.lifetimemovieclub.com/movies/10-year-reunion/full-movie',
         'only_matching': True
     }, {
         'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special',
         'only_matching': True
     }, {
-        'url': 'https://www.historyvault.com/collections/america-the-story-of-us/westward',
+        'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story/preview-hunting-jonbenets-killer-the-untold-story',
+        'only_matching': True
+    }, {
+        'url': 'http://www.history.com/videos/history-of-valentines-day',
         'only_matching': True
     }, {
-        'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story/preview-hunting-jonbenets-killer-the-untold-story',
+        'url': 'https://play.aetv.com/shows/duck-dynasty/videos/best-of-duck-dynasty-getting-quack-in-shape',
         'only_matching': True
     }]
-    _DOMAIN_TO_REQUESTOR_ID = {
-        'history.com': 'HISTORY',
-        'aetv.com': 'AETV',
-        'mylifetime.com': 'LIFETIME',
-        'lifetimemovieclub.com': 'LIFETIMEMOVIECLUB',
-        'fyi.tv': 'FYI',
-    }
 
     def _real_extract(self, url):
-        domain, show_path, movie_display_id, special_display_id, collection_display_id = re.match(self._VALID_URL, url).groups()
-        display_id = show_path or movie_display_id or special_display_id or collection_display_id
-        webpage = self._download_webpage(url, display_id, headers=self.geo_verification_headers())
-        if show_path:
-            url_parts = show_path.split('/')
-            url_parts_len = len(url_parts)
-            if url_parts_len == 1:
-                entries = []
-                for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage):
-                    entries.append(self.url_result(
-                        compat_urlparse.urljoin(url, season_url_path), 'AENetworks'))
-                if entries:
-                    return self.playlist_result(
-                        entries, self._html_search_meta('aetn:SeriesId', webpage),
-                        self._html_search_meta('aetn:SeriesTitle', webpage))
-                else:
-                    # single season
-                    url_parts_len = 2
-            if url_parts_len == 2:
-                entries = []
-                for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage):
-                    episode_attributes = extract_attributes(episode_item)
-                    episode_url = compat_urlparse.urljoin(
-                        url, episode_attributes['data-canonical'])
-                    entries.append(self.url_result(
-                        episode_url, 'AENetworks',
-                        episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id')))
-                return self.playlist_result(
-                    entries, self._html_search_meta('aetn:SeasonId', webpage))
-
-        video_id = self._html_search_meta('aetn:VideoID', webpage)
-        media_url = self._search_regex(
-            [r"media_url\s*=\s*'(?P<url>[^']+)'",
-             r'data-media-url=(?P<url>(?:https?:)?//[^\s>]+)',
-             r'data-media-url=(["\'])(?P<url>(?:(?!\1).)+?)\1'],
-            webpage, 'video url', group='url')
-        theplatform_metadata = self._download_theplatform_metadata(self._search_regex(
-            r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id)
-        info = self._parse_theplatform_metadata(theplatform_metadata)
-        auth = None
-        if theplatform_metadata.get('AETN$isBehindWall'):
-            requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain]
-            resource = self._get_mvpd_resource(
-                requestor_id, theplatform_metadata['title'],
-                theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'),
-                theplatform_metadata['ratings'][0]['rating'])
-            auth = self._extract_mvpd_auth(
-                url, video_id, requestor_id, resource)
-        info.update(self._search_json_ld(webpage, video_id, fatal=False))
-        info.update(self._extract_aen_smil(media_url, video_id, auth))
-        return info
+        domain, canonical = re.match(self._VALID_URL, url).groups()
+        return self._extract_aetn_info(domain, 'canonical', '/' + canonical, url)
+
+
+class AENetworksListBaseIE(AENetworksBaseIE):
+    def _call_api(self, resource, slug, brand, fields):
+        return self._download_json(
+            'https://yoga.appsvcs.aetnd.com/graphql',
+            slug, query={'brand': brand}, data=urlencode_postdata({
+                'query': '''{
+  %s(slug: "%s") {
+    %s
+  }
+}''' % (resource, slug, fields),
+            }))['data'][resource]
+
+    def _real_extract(self, url):
+        domain, slug = re.match(self._VALID_URL, url).groups()
+        _, brand = self._DOMAIN_MAP[domain]
+        playlist = self._call_api(self._RESOURCE, slug, brand, self._FIELDS)
+        base_url = 'http://watch.%s' % domain
+
+        entries = []
+        for item in (playlist.get(self._ITEMS_KEY) or []):
+            doc = self._get_doc(item)
+            canonical = doc.get('canonical')
+            if not canonical:
+                continue
+            entries.append(self.url_result(
+                base_url + canonical, AENetworksIE.ie_key(), doc.get('id')))
+
+        description = None
+        if self._PLAYLIST_DESCRIPTION_KEY:
+            description = playlist.get(self._PLAYLIST_DESCRIPTION_KEY)
+
+        return self.playlist_result(
+            entries, playlist.get('id'),
+            playlist.get(self._PLAYLIST_TITLE_KEY), description)
+
+
+class AENetworksCollectionIE(AENetworksListBaseIE):
+    IE_NAME = 'aenetworks:collection'
+    _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'(?:[^/]+/)*(?:list|collections)/(?P<id>[^/?#&]+)/?(?:[?#&]|$)'
+    _TESTS = [{
+        'url': 'https://watch.historyvault.com/list/america-the-story-of-us',
+        'info_dict': {
+            'id': '282',
+            'title': 'America The Story of Us',
+        },
+        'playlist_mincount': 12,
+    }, {
+        'url': 'https://watch.historyvault.com/shows/america-the-story-of-us-2/season-1/list/america-the-story-of-us',
+        'only_matching': True
+    }, {
+        'url': 'https://www.historyvault.com/collections/mysteryquest',
+        'only_matching': True
+    }]
+    _RESOURCE = 'list'
+    _ITEMS_KEY = 'items'
+    _PLAYLIST_TITLE_KEY = 'display_title'
+    _PLAYLIST_DESCRIPTION_KEY = None
+    _FIELDS = '''id
+    display_title
+    items {
+      ... on ListVideoItem {
+        doc {
+          canonical
+          id
+        }
+      }
+    }'''
+
+    def _get_doc(self, item):
+        return item.get('doc') or {}
+
+
+class AENetworksShowIE(AENetworksListBaseIE):
+    IE_NAME = 'aenetworks:show'
+    _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'shows/(?P<id>[^/?#&]+)/?(?:[?#&]|$)'
+    _TESTS = [{
+        'url': 'http://www.history.com/shows/ancient-aliens',
+        'info_dict': {
+            'id': 'SH012427480000',
+            'title': 'Ancient Aliens',
+            'description': 'md5:3f6d74daf2672ff3ae29ed732e37ea7f',
+        },
+        'playlist_mincount': 168,
+    }]
+    _RESOURCE = 'series'
+    _ITEMS_KEY = 'episodes'
+    _PLAYLIST_TITLE_KEY = 'title'
+    _PLAYLIST_DESCRIPTION_KEY = 'description'
+    _FIELDS = '''description
+    id
+    title
+    episodes {
+      canonical
+      id
+    }'''
+
+    def _get_doc(self, item):
+        return item
 
 
 class HistoryTopicIE(AENetworksBaseIE):
@@ -204,6 +287,7 @@ class HistoryTopicIE(AENetworksBaseIE):
             'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7',
             'timestamp': 1375819729,
             'upload_date': '20130806',
+            'uploader': 'AENE-NEW',
         },
         'params': {
             # m3u8 download
@@ -212,36 +296,47 @@ class HistoryTopicIE(AENetworksBaseIE):
         'add_ie': ['ThePlatform'],
     }]
 
-    def theplatform_url_result(self, theplatform_url, video_id, query):
-        return {
-            '_type': 'url_transparent',
-            'id': video_id,
-            'url': smuggle_url(
-                update_url_query(theplatform_url, query),
-                {
-                    'sig': {
-                        'key': self._THEPLATFORM_KEY,
-                        'secret': self._THEPLATFORM_SECRET,
-                    },
-                    'force_smil_url': True
-                }),
-            'ie_key': 'ThePlatform',
-        }
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        return self.url_result(
+            'http://www.history.com/videos/' + display_id,
+            AENetworksIE.ie_key())
+
+
+class HistoryPlayerIE(AENetworksBaseIE):
+    IE_NAME = 'history:player'
+    _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|biography)\.com)/player/(?P<id>\d+)'
+    _TESTS = []
+
+    def _real_extract(self, url):
+        domain, video_id = re.match(self._VALID_URL, url).groups()
+        return self._extract_aetn_info(domain, 'id', video_id, url)
+
+
+class BiographyIE(AENetworksBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?biography\.com/video/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://www.biography.com/video/vincent-van-gogh-full-episode-2075049808',
+        'info_dict': {
+            'id': '30322987',
+            'ext': 'mp4',
+            'title': 'Vincent Van Gogh - Full Episode',
+            'description': 'A full biography about the most influential 20th century painter, Vincent Van Gogh.',
+            'timestamp': 1311970571,
+            'upload_date': '20110729',
+            'uploader': 'AENE-NEW',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'add_ie': ['ThePlatform'],
+    }]
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
         webpage = self._download_webpage(url, display_id)
-        video_id = self._search_regex(
-            r'<phoenix-iframe[^>]+src="[^"]+\btpid=(\d+)', webpage, 'tpid')
-        result = self._download_json(
-            'https://feeds.video.aetnd.com/api/v2/history/videos',
-            video_id, query={'filter[id]': video_id})['results'][0]
-        title = result['title']
-        info = self._extract_aen_smil(result['publicUrl'], video_id)
-        info.update({
-            'title': title,
-            'description': result.get('description'),
-            'duration': int_or_none(result.get('duration')),
-            'timestamp': int_or_none(result.get('added'), 1000),
-        })
-        return info
+        player_url = self._search_regex(
+            r'<phoenix-iframe[^>]+src="(%s)' % HistoryPlayerIE._VALID_URL,
+            webpage, 'player URL')
+        return self.url_result(player_url, HistoryPlayerIE.ie_key())
index 6fb3d6c53fe8e25382b16f93257b9f98b05594cd..b8027bbca16f0ff4a38806f4f588a98609232025 100644 (file)
@@ -1,6 +1,8 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .theplatform import ThePlatformIE
 from ..utils import (
     int_or_none,
 
 
 class AMCNetworksIE(ThePlatformIE):
-    _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?:movies|shows(?:/[^/]+)+)/(?P<id>[^/?#]+)'
+    _VALID_URL = r'https?://(?:www\.)?(?P<site>amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P<id>(?:movies|shows(?:/[^/]+)+)/[^/?#&]+)'
     _TESTS = [{
-        'url': 'http://www.ifc.com/shows/maron/season-04/episode-01/step-1',
-        'md5': '',
+        'url': 'https://www.bbcamerica.com/shows/the-graham-norton-show/videos/tina-feys-adorable-airline-themed-family-dinner--51631',
         'info_dict': {
-            'id': 's3MX01Nl4vPH',
+            'id': '4Lq1dzOnZGt0',
             'ext': 'mp4',
-            'title': 'Maron - Season 4 - Step 1',
-            'description': 'In denial about his current situation, Marc is reluctantly convinced by his friends to enter rehab. Starring Marc Maron and Constance Zimmer.',
-            'age_limit': 17,
-            'upload_date': '20160505',
-            'timestamp': 1462468831,
+            'title': "The Graham Norton Show - Season 28 - Tina Fey's Adorable Airline-Themed Family Dinner",
+            'description': "It turns out child stewardesses are very generous with the wine! All-new episodes of 'The Graham Norton Show' premiere Fridays at 11/10c on BBC America.",
+            'upload_date': '20201120',
+            'timestamp': 1605904350,
             'uploader': 'AMCN',
         },
         'params': {
             # m3u8 download
             'skip_download': True,
         },
-        'skip': 'Requires TV provider accounts',
     }, {
         'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge',
         'only_matching': True,
@@ -55,32 +54,34 @@ class AMCNetworksIE(ThePlatformIE):
         'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1',
         'only_matching': True,
     }]
+    _REQUESTOR_ID_MAP = {
+        'amc': 'AMC',
+        'bbcamerica': 'BBCA',
+        'ifc': 'IFC',
+        'sundancetv': 'SUNDANCE',
+        'wetv': 'WETV',
+    }
 
     def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
+        site, display_id = re.match(self._VALID_URL, url).groups()
+        requestor_id = self._REQUESTOR_ID_MAP[site]
+        properties = self._download_json(
+            'https://content-delivery-gw.svc.ds.amcn.com/api/v2/content/amcn/%s/url/%s' % (requestor_id.lower(), display_id),
+            display_id)['data']['properties']
         query = {
             'mbr': 'true',
             'manifest': 'm3u',
         }
-        media_url = self._search_regex(
-            r'window\.platformLinkURL\s*=\s*[\'"]([^\'"]+)',
-            webpage, 'media url')
-        theplatform_metadata = self._download_theplatform_metadata(self._search_regex(
-            r'link\.theplatform\.com/s/([^?]+)',
-            media_url, 'theplatform_path'), display_id)
+        tp_path = 'M_UwQC/media/' + properties['videoPid']
+        media_url = 'https://link.theplatform.com/s/' + tp_path
+        theplatform_metadata = self._download_theplatform_metadata(tp_path, display_id)
         info = self._parse_theplatform_metadata(theplatform_metadata)
         video_id = theplatform_metadata['pid']
         title = theplatform_metadata['title']
         rating = try_get(
             theplatform_metadata, lambda x: x['ratings'][0]['rating'])
-        auth_required = self._search_regex(
-            r'window\.authRequired\s*=\s*(true|false);',
-            webpage, 'auth required')
-        if auth_required == 'true':
-            requestor_id = self._search_regex(
-                r'window\.requestor_id\s*=\s*[\'"]([^\'"]+)',
-                webpage, 'requestor id')
+        video_category = properties.get('videoCategory')
+        if video_category and video_category.endswith('-Auth'):
             resource = self._get_mvpd_resource(
                 requestor_id, title, video_id, rating)
             query['auth'] = self._extract_mvpd_auth(
index 9c9d77ae107e0b822b46368d89445f21e9e830a6..e20f00fc3efabf7ed5c12892aeeafa986bfb803e 100644 (file)
@@ -1,33 +1,33 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..utils import (
     clean_html,
-    int_or_none,
-    js_to_json,
     try_get,
     unified_strdate,
 )
 
 
 class AmericasTestKitchenIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:episode|videos)/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?P<resource_type>episode|videos)/(?P<id>\d+)'
     _TESTS = [{
         'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers',
         'md5': 'b861c3e365ac38ad319cfd509c30577f',
         'info_dict': {
             'id': '5b400b9ee338f922cb06450c',
-            'title': 'Weeknight Japanese Suppers',
+            'title': 'Japanese Suppers',
             'ext': 'mp4',
-            'description': 'md5:3d0c1a44bb3b27607ce82652db25b4a8',
+            'description': 'md5:64e606bfee910627efc4b5f050de92b3',
             'thumbnail': r're:^https?://',
             'timestamp': 1523664000,
             'upload_date': '20180414',
-            'release_date': '20180414',
+            'release_date': '20180410',
             'series': "America's Test Kitchen",
             'season_number': 18,
-            'episode': 'Weeknight Japanese Suppers',
+            'episode': 'Japanese Suppers',
             'episode_number': 15,
         },
         'params': {
@@ -36,47 +36,31 @@ class AmericasTestKitchenIE(InfoExtractor):
     }, {
         'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon',
         'only_matching': True,
+    }, {
+        'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.cooksillustrated.com/videos/4478-beef-wellington',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, video_id)
+        resource_type, video_id = re.match(self._VALID_URL, url).groups()
+        is_episode = resource_type == 'episode'
+        if is_episode:
+            resource_type = 'episodes'
 
-        video_data = self._parse_json(
-            self._search_regex(
-                r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*</script>',
-                webpage, 'initial context'),
-            video_id, js_to_json)
-
-        ep_data = try_get(
-            video_data,
-            (lambda x: x['episodeDetail']['content']['data'],
-             lambda x: x['videoDetail']['content']['data']), dict)
-        ep_meta = ep_data.get('full_video', {})
-
-        zype_id = ep_data.get('zype_id') or ep_meta['zype_id']
-
-        title = ep_data.get('title') or ep_meta.get('title')
-        description = clean_html(ep_meta.get('episode_description') or ep_data.get(
-            'description') or ep_meta.get('description'))
-        thumbnail = try_get(ep_meta, lambda x: x['photo']['image_url'])
-        release_date = unified_strdate(ep_data.get('aired_at'))
-
-        season_number = int_or_none(ep_meta.get('season_number'))
-        episode = ep_meta.get('title')
-        episode_number = int_or_none(ep_meta.get('episode_number'))
+        resource = self._download_json(
+            'https://www.americastestkitchen.com/api/v6/%s/%s' % (resource_type, video_id), video_id)
+        video = resource['video'] if is_episode else resource
+        episode = resource if is_episode else resource.get('episode') or {}
 
         return {
             '_type': 'url_transparent',
-            'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % zype_id,
+            'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % video['zypeId'],
             'ie_key': 'Zype',
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'release_date': release_date,
-            'series': "America's Test Kitchen",
-            'season_number': season_number,
-            'episode': episode,
-            'episode_number': episode_number,
+            'description': clean_html(video.get('description')),
+            'release_date': unified_strdate(video.get('publishDate')),
+            'series': try_get(episode, lambda x: x['show']['title']),
+            'episode': episode.get('title'),
         }
index 84e841035afb16a44d7225456845c1162d712fa3..b7398563b35ab55282a5f6f2b1254a23e131f1f1 100644 (file)
@@ -116,7 +116,76 @@ class AnvatoIE(InfoExtractor):
         'anvato_scripps_app_ios_prod_409c41960c60b308db43c3cc1da79cab9f1c3d93': 'WPxj5GraLTkYCyj3M7RozLqIycjrXOEcDGFMIJPn',
         'EZqvRyKBJLrgpClDPDF8I7Xpdp40Vx73': '4OxGd2dEakylntVKjKF0UK9PDPYB6A9W',
         'M2v78QkpleXm9hPp9jUXI63x5vA6BogR': 'ka6K32k7ZALmpINkjJUGUo0OE42Md1BQ',
-        'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ'
+        'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
+        'X8POa4zPPaKVZHqmWjuEzfP31b1QM9VN': 'Dn5vOY9ooDw7VSl9qztjZI5o0g08mA0z',
+        'M2v78QkBMpNJlSPp9diX5F2PBmBy6Bog': 'ka6K32kyo7nDZfNkjQCGWf1lpApXMd1B',
+        'bvJ0dQpav07l0hG5JgfVLF2dv1vARwpP': 'BzoQW24GrJZoJfmNodiJKSPeB9B8NOxj',
+        'lxQMLg2XZKuEZaWgsqubBxV9INZ6bryY': 'Vm2Mx6noKds9jB71h6urazwlTG3m9x8l',
+        '04EnjvXeoSmkbJ9ckPs7oY0mcxv7PlyN': 'aXERQP9LMfQVlEDsgGs6eEA1SWznAQ8P',
+        'mQbO2ge6BFRWVPYCYpU06YvNt80XLvAX': 'E2BV1NGmasN5v7eujECVPJgwflnLPm2A',
+        'g43oeBzJrCml7o6fa5fRL1ErCdeD8z4K': 'RX34mZ6zVH4Nr6whbxIGLv9WSbxEKo8V',
+        'VQrDJoP7mtdBzkxhXbSPwGB1coeElk4x': 'j2VejQx0VFKQepAF7dI0mJLKtOVJE18z',
+        'WxA5NzLRjCrmq0NUgaU5pdMDuZO7RJ4w': 'lyY5ADLKaIOLEgAsGQCveEMAcqnx3rY9',
+        'M4lpMXB71ie0PjMCjdFzVXq0SeRVqz49': 'n2zVkOqaLIv3GbLfBjcwW51LcveWOZ2e',
+        'dyDZGEqN8u8nkJZcJns0oxYmtP7KbGAn': 'VXOEqQW9BtEVLajfZQSLEqxgS5B7qn2D',
+        'E7QNjrVY5u5mGvgu67IoDgV1CjEND8QR': 'rz8AaDmdKIkLmPNhB5ILPJnjS5PnlL8d',
+        'a4zrqjoKlfzg0dwHEWtP31VqcLBpjm4g': 'LY9J16gwETdGWa3hjBu5o0RzuoQDjqXQ',
+        'dQP5BZroMsMVLO1hbmT5r2Enu86GjxA6': '7XR3oOdbPF6x3PRFLDCq9RkgsRjAo48V',
+        'M4lKNBO1NFe0PjMCj1tzVXq0SeRVqzA9': 'n2zoRqGLRUv3GbLfBmTwW51LcveWOZYe',
+        'nAZ7MZdpGCGg1pqFEbsoJOz2C60mv143': 'dYJgdqA9aT4yojETqGi7yNgoFADxqmXP',
+        '3y1MERYgOuE9NzbFgwhV6Wv2F0YKvbyz': '081xpZDQgC4VadLTavhWQxrku56DAgXV',
+        'bmQvmEXr5HWklBMCZOcpE2Z3HBYwqGyl': 'zxXPbVNyMiMAZldhr9FkOmA0fl4aKr2v',
+        'wA7oDNYldfr6050Hwxi52lPZiVlB86Ap': 'ZYK16aA7ni0d3l3c34uwpxD7CbReMm8Q',
+        'g43MbKMWmFml7o7sJoSRkXxZiXRvJ3QK': 'RX3oBJonvs4Nr6rUWBCGn3matRGqJPXV',
+        'mA9VdlqpLS0raGaSDvtoqNrBTzb8XY4q': '0XN4OjBD3fnW7r7IbmtJB4AyfOmlrE2r',
+        'mAajOwgkGt17oGoFmEuklMP9H0GnW54d': 'lXbBLPGyzikNGeGujAuAJGjZiwLRxyXR',
+        'vy8vjJ9kbUwrRqRu59Cj5dWZfzYErlAb': 'K8l7gpwaGcBpnAnCLNCmPZRdin3eaQX0',
+        'xQMWBpR8oHEZaWaSMGUb0avOHjLVYn4Y': 'm2MrN4vEaf9jB7BFy5Srb40jTrN67AYl',
+        'xyKEmVO3miRr6D6UVkt7oB8jtD6aJEAv': 'g2ddDebqDfqdgKgswyUKwGjbTWwzq923',
+        '7Qk0wa2D9FjKapacoJF27aLvUDKkLGA0': 'b2kgBEkephJaMkMTL7s1PLe4Ua6WyP2P',
+        '3QLg6nqmNTJ5VvVTo7f508LPidz1xwyY': 'g2L1GgpraipmAOAUqmIbBnPxHOmw4MYa',
+        '3y1B7zZjXTE9NZNSzZSVNPZaTNLjo6Qz': '081b5G6wzH4VagaURmcWbN5mT4JGEe2V',
+        'lAqnwvkw6SG6D8DSqmUg6DRLUp0w3G4x': 'O2pbP0xPDFNJjpjIEvcdryOJtpkVM4X5',
+        'awA7xd1N0Hr6050Hw2c52lPZiVlB864p': 'GZYKpn4aoT0d3l3c3PiwpxD7CbReMmXQ',
+        'jQVqPLl9YHL1WGWtR1HDgWBGT63qRNyV': '6X03ne6vrU4oWyWUN7tQVoajikxJR3Ye',
+        'GQRMR8mL7uZK797t7xH3eNzPIP5dOny1': 'm2vqPWGd4U31zWzSyasDRAoMT1PKRp8o',
+        'zydq9RdmRhXLkNkfNoTJlMzaF0lWekQB': '3X7LnvE7vH5nkEkSqLiey793Un7dLB8e',
+        'VQrDzwkB2IdBzjzu9MHPbEYkSB50gR4x': 'j2VebLzoKUKQeEesmVh0gM1eIp9jKz8z',
+        'mAa2wMamBs17oGoFmktklMP9H0GnW54d': 'lXbgP74xZTkNGeGujVUAJGjZiwLRxy8R',
+        '7yjB6ZLG6sW8R6RF2xcan1KGfJ5dNoyd': 'wXQkPorvPHZ45N5t4Jf6qwg5Tp4xvw29',
+        'a4zPpNeWGuzg0m0iX3tPeanGSkRKWXQg': 'LY9oa3QAyHdGW9Wu3Ri5JGeEik7l1N8Q',
+        'k2rneA2M38k25cXDwwSknTJlxPxQLZ6M': '61lyA2aEVDzklfdwmmh31saPxQx2VRjp',
+        'bK9Zk4OvPnvxduLgxvi8VUeojnjA02eV': 'o5jANYjbeMb4nfBaQvcLAt1jzLzYx6ze',
+        '5VD6EydM3R9orHmNMGInGCJwbxbQvGRw': 'w3zjmX7g4vnxzCxElvUEOiewkokXprkZ',
+        '70X35QbVYVYNPUmP9YfbzI06YqYQk2R1': 'vG4Aj2BMjMjoztB7zeFOnCVPJpJ8lMOa',
+        '26qYwQVG9p1Bks2GgBckjfDJOXOAMgG1': 'r4ev9X0mv5zqJc0yk5IBDcQOwZw8mnwQ',
+        'rvVKpA56MBXWlSxMw3cobT5pdkd4Dm7q': '1J7ZkY53pZ645c93owcLZuveE7E8B3rL',
+        'qN1zdy1zlYL23IWZGWtDvfV6WeWQWkJo': 'qN1zdy1zlYL23IWZGWtDvfV6WeWQWkJo',
+        'jdKqRGF16dKsBviMDae7IGDl7oTjEbVV': 'Q09l7vhlNxPFErIOK6BVCe7KnwUW5DVV',
+        '3QLkogW1OUJ5VvPsrDH56DY2u7lgZWyY': 'g2LRE1V9espmAOPhE4ubj4ZdUA57yDXa',
+        'wyJvWbXGBSdbkEzhv0CW8meou82aqRy8': 'M2wolPvyBIpQGkbT4juedD4ruzQGdK2y',
+        '7QkdZrzEkFjKap6IYDU2PB0oCNZORmA0': 'b2kN1l96qhJaMkPs9dt1lpjBfwqZoA8P',
+        'pvA05113MHG1w3JTYxc6DVlRCjErVz4O': 'gQXeAbblBUnDJ7vujbHvbRd1cxlz3AXO',
+        'mA9blJDZwT0raG1cvkuoeVjLC7ZWd54q': '0XN9jRPwMHnW7rvumgfJZOD9CJgVkWYr',
+        '5QwRN5qKJTvGKlDTmnf7xwNZcjRmvEy9': 'R2GP6LWBJU1QlnytwGt0B9pytWwAdDYy',
+        'eyn5rPPbkfw2KYxH32fG1q58CbLJzM40': 'p2gyqooZnS56JWeiDgfmOy1VugOQEBXn',
+        '3BABn3b5RfPJGDwilbHe7l82uBoR05Am': '7OYZG7KMVhbPdKJS3xcWEN3AuDlLNmXj',
+        'xA5zNGXD3HrmqMlF6OS5pdMDuZO7RJ4w': 'yY5DAm6r1IOLE3BCVMFveEMAcqnx3r29',
+        'g43PgW3JZfml7o6fDEURL1ErCdeD8zyK': 'RX3aQn1zrS4Nr6whDgCGLv9WSbxEKo2V',
+        'lAqp8WbGgiG6D8LTKJcg3O72CDdre1Qx': 'O2pnm6473HNJjpKuVosd3vVeh975yrX5',
+        'wyJbYEDxKSdbkJ6S6RhW8meou82aqRy8': 'M2wPm7EgRSpQGlAh70CedD4ruzQGdKYy',
+        'M4lgW28nLCe0PVdtaXszVXq0SeRVqzA9': 'n2zmJvg4jHv3G0ETNgiwW51LcveWOZ8e',
+        '5Qw3OVvp9FvGKlDTmOC7xwNZcjRmvEQ9': 'R2GzDdml9F1Qlnytw9s0B9pytWwAdD8y',
+        'vy8a98X7zCwrRqbHrLUjYzwDiK2b70Qb': 'K8lVwzyjZiBpnAaSGeUmnAgxuGOBxmY0',
+        'g4eGjJLLoiqRD3Pf9oT5O03LuNbLRDQp': '6XqD59zzpfN4EwQuaGt67qNpSyRBlnYy',
+        'g43OPp9boIml7o6fDOIRL1ErCdeD8z4K': 'RX33alNB4s4Nr6whDPUGLv9WSbxEKoXV',
+        'xA2ng9OkBcGKzDbTkKsJlx7dUK8R3dA5': 'z2aPnJvzBfObkwGC3vFaPxeBhxoMqZ8K',
+        'xyKEgBajZuRr6DEC0Kt7XpD1cnNW9gAv': 'g2ddlEBvRsqdgKaI4jUK9PrgfMexGZ23',
+        'BAogww51jIMa2JnH1BcYpXM5F658RNAL': 'rYWDmm0KptlkGv4FGJFMdZmjs9RDE6XR',
+        'BAokpg62VtMa2JnH1mHYpXM5F658RNAL': 'rYWryDnlNslkGv4FG4HMdZmjs9RDE62R',
+        'a4z1Px5e2hzg0m0iMMCPeanGSkRKWXAg': 'LY9eorNQGUdGW9WuKKf5JGeEik7l1NYQ',
+        'kAx69R58kF9nY5YcdecJdl2pFXP53WyX': 'gXyRxELpbfPvLeLSaRil0mp6UEzbZJ8L',
+        'BAoY13nwViMa2J2uo2cY6BlETgmdwryL': 'rYWwKzJmNFlkGvGtNoUM9bzwIJVzB1YR',
     }
 
     _MCP_TO_ACCESS_KEY_TABLE = {
@@ -189,19 +258,17 @@ def _get_video_json(self, access_key, video_id):
 
         video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii')
         anvrid = md5_text(time.time() * 1000 * random.random())[:30]
-        payload = {
-            'api': {
-                'anvrid': anvrid,
-                'anvstk': md5_text('%s|%s|%d|%s' % (
-                    access_key, anvrid, server_time,
-                    self._ANVACK_TABLE.get(access_key, self._API_KEY))),
-                'anvts': server_time,
-            },
+        api = {
+            'anvrid': anvrid,
+            'anvts': server_time,
         }
+        api['anvstk'] = md5_text('%s|%s|%d|%s' % (
+            access_key, anvrid, server_time,
+            self._ANVACK_TABLE.get(access_key, self._API_KEY)))
 
         return self._download_json(
             video_data_url, video_id, transform_source=strip_jsonp,
-            data=json.dumps(payload).encode('utf-8'))
+            data=json.dumps({'api': api}).encode('utf-8'))
 
     def _get_anvato_videos(self, access_key, video_id):
         video_data = self._get_video_json(access_key, video_id)
@@ -259,7 +326,7 @@ def _get_anvato_videos(self, access_key, video_id):
             'description': video_data.get('def_description'),
             'tags': video_data.get('def_tags', '').split(','),
             'categories': video_data.get('categories'),
-            'thumbnail': video_data.get('thumbnail'),
+            'thumbnail': video_data.get('src_image_url') or video_data.get('thumbnail'),
             'timestamp': int_or_none(video_data.get(
                 'ts_published') or video_data.get('ts_added')),
             'uploader': video_data.get('mcp_id'),
diff --git a/youtube_dlc/extractor/anvato_token_generator/__init__.py b/youtube_dlc/extractor/anvato_token_generator/__init__.py
new file mode 100644 (file)
index 0000000..6e223db
--- /dev/null
@@ -0,0 +1,7 @@
+from __future__ import unicode_literals
+
+from .nfl import NFLTokenGenerator
+
+__all__ = [
+    'NFLTokenGenerator',
+]
diff --git a/youtube_dlc/extractor/anvato_token_generator/common.py b/youtube_dlc/extractor/anvato_token_generator/common.py
new file mode 100644 (file)
index 0000000..b959a90
--- /dev/null
@@ -0,0 +1,6 @@
+from __future__ import unicode_literals
+
+
+class TokenGenerator:
+    def generate(self, anvack, mcp_id):
+        raise NotImplementedError('This method must be implemented by subclasses')
diff --git a/youtube_dlc/extractor/anvato_token_generator/nfl.py b/youtube_dlc/extractor/anvato_token_generator/nfl.py
new file mode 100644 (file)
index 0000000..97a2b24
--- /dev/null
@@ -0,0 +1,30 @@
+from __future__ import unicode_literals
+
+import json
+
+from .common import TokenGenerator
+
+
+class NFLTokenGenerator(TokenGenerator):
+    _AUTHORIZATION = None
+
+    def generate(ie, anvack, mcp_id):
+        if not NFLTokenGenerator._AUTHORIZATION:
+            reroute = ie._download_json(
+                'https://api.nfl.com/v1/reroute', mcp_id,
+                data=b'grant_type=client_credentials',
+                headers={'X-Domain-Id': 100})
+            NFLTokenGenerator._AUTHORIZATION = '%s %s' % (reroute.get('token_type') or 'Bearer', reroute['access_token'])
+        return ie._download_json(
+            'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({
+                'query': '''{
+  viewer {
+    mediaToken(anvack: "%s", id: %s) {
+      token
+    }
+  }
+}''' % (anvack, mcp_id),
+            }).encode(), headers={
+                'Authorization': NFLTokenGenerator._AUTHORIZATION,
+                'Content-Type': 'application/json',
+            })['data']['viewer']['mediaToken']['token']
index 883dcee7aa4cae953fff16dbca8cbc5fbf07e64e..a9527e78550297e416baa9afa11ee01c767daf3c 100644 (file)
@@ -3,6 +3,7 @@
 
 from .common import InfoExtractor
 from ..utils import (
+    get_element_by_id,
     int_or_none,
     merge_dicts,
     mimetype2ext,
@@ -39,23 +40,15 @@ def _real_extract(self, url):
         webpage = self._download_webpage(url, video_id, fatal=False)
 
         if not webpage:
-            # Note: There is an easier-to-parse configuration at
-            # http://www.aparat.com/video/video/config/videohash/%video_id
-            # but the URL in there does not work
             webpage = self._download_webpage(
                 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id,
                 video_id)
 
-        options = self._parse_json(
-            self._search_regex(
-                r'options\s*=\s*JSON\.parse\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1\s*\)',
-                webpage, 'options', group='value'),
-            video_id)
-
-        player = options['plugins']['sabaPlayerPlugin']
+        options = self._parse_json(self._search_regex(
+            r'options\s*=\s*({.+?})\s*;', webpage, 'options'), video_id)
 
         formats = []
-        for sources in player['multiSRC']:
+        for sources in (options.get('multiSRC') or []):
             for item in sources:
                 if not isinstance(item, dict):
                     continue
@@ -85,11 +78,12 @@ def _real_extract(self, url):
         info = self._search_json_ld(webpage, video_id, default={})
 
         if not info.get('title'):
-            info['title'] = player['title']
+            info['title'] = get_element_by_id('videoTitle', webpage) or \
+                self._html_search_meta(['og:title', 'twitter:title', 'DC.Title', 'title'], webpage, fatal=True)
 
         return merge_dicts(info, {
             'id': video_id,
             'thumbnail': url_or_none(options.get('poster')),
-            'duration': int_or_none(player.get('duration')),
+            'duration': int_or_none(options.get('duration')),
             'formats': formats,
         })
diff --git a/youtube_dlc/extractor/arcpublishing.py b/youtube_dlc/extractor/arcpublishing.py
new file mode 100644 (file)
index 0000000..ca6a6c4
--- /dev/null
@@ -0,0 +1,174 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    extract_attributes,
+    int_or_none,
+    parse_iso8601,
+    try_get,
+)
+
+
class ArcPublishingIE(InfoExtractor):
    """Extractor for Arc Publishing (PoWa player) videos.

    Matches internal ``arcpublishing:<org>:<uuid>`` URLs; these are produced
    from embed markup by :meth:`_extract_urls` (PoWa player ``<div>``s carrying
    ``data-org``/``data-uuid`` attributes).
    """
    _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}'
    _VALID_URL = r'arcpublishing:(?P<org>[a-z]+):(?P<id>%s)' % _UUID_REGEX
    _TESTS = [{
        # https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/
        'url': 'arcpublishing:adn:8c99cb6e-b29c-4bc9-9173-7bf9979225ab',
        'only_matching': True,
    }, {
        # https://www.bostonglobe.com/video/2020/12/30/metro/footage-released-showing-officer-talking-about-striking-protesters-with-car/
        'url': 'arcpublishing:bostonglobe:232b7ae6-7d73-432d-bc0a-85dbf0119ab1',
        'only_matching': True,
    }, {
        # https://www.actionnewsjax.com/video/live-stream/
        'url': 'arcpublishing:cmg:cfb1cf1b-3ab5-4d1b-86c5-a5515d311f2a',
        'only_matching': True,
    }, {
        # https://elcomercio.pe/videos/deportes/deporte-total-futbol-peruano-seleccion-peruana-la-valorizacion-de-los-peruanos-en-el-exterior-tras-un-2020-atipico-nnav-vr-video-noticia/
        'url': 'arcpublishing:elcomercio:27a7e1f8-2ec7-4177-874f-a4feed2885b3',
        'only_matching': True,
    }, {
        # https://www.clickondetroit.com/video/community/2020/05/15/events-surrounding-woodward-dream-cruise-being-canceled/
        'url': 'arcpublishing:gmg:c8793fb2-8d44-4242-881e-2db31da2d9fe',
        'only_matching': True,
    }, {
        # https://www.wabi.tv/video/2020/12/30/trenton-company-making-equipment-pfizer-covid-vaccine/
        'url': 'arcpublishing:gray:0b0ba30e-032a-4598-8810-901d70e6033e',
        'only_matching': True,
    }, {
        # https://www.lateja.cr/el-mundo/video-china-aprueba-con-condiciones-su-primera/dfcbfa57-527f-45ff-a69b-35fe71054143/video/
        'url': 'arcpublishing:gruponacion:dfcbfa57-527f-45ff-a69b-35fe71054143',
        'only_matching': True,
    }, {
        # https://www.fifthdomain.com/video/2018/03/09/is-america-vulnerable-to-a-cyber-attack/
        'url': 'arcpublishing:mco:aa0ca6fe-1127-46d4-b32c-be0d6fdb8055',
        'only_matching': True,
    }, {
        # https://www.vl.no/kultur/2020/12/09/en-melding-fra-en-lytter-endret-julelista-til-lewi-bergrud/
        'url': 'arcpublishing:mentormedier:47a12084-650b-4011-bfd0-3699b6947b2d',
        'only_matching': True,
    }, {
        # https://www.14news.com/2020/12/30/whiskey-theft-caught-camera-henderson-liquor-store/
        'url': 'arcpublishing:raycom:b89f61f8-79fa-4c09-8255-e64237119bf7',
        'only_matching': True,
    }, {
        # https://www.theglobeandmail.com/world/video-ethiopian-woman-who-became-symbol-of-integration-in-italy-killed-on/
        'url': 'arcpublishing:tgam:411b34c1-8701-4036-9831-26964711664b',
        'only_matching': True,
    }, {
        # https://www.pilotonline.com/460f2931-8130-4719-8ea1-ffcb2d7cb685-132.html
        'url': 'arcpublishing:tronc:460f2931-8130-4719-8ea1-ffcb2d7cb685',
        'only_matching': True,
    }]
    # (org list, API host template) pairs mapping publisher orgs to their API
    # endpoints; orgs not listed fall back to the generic
    # '%s-prod-cdn.video-api.arcpublishing.com/api' template in _real_extract.
    _POWA_DEFAULTS = [
        (['cmg', 'prisa'], '%s-config-prod.api.cdn.arcpublishing.com/video'),
        ([
            'adn', 'advancelocal', 'answers', 'bonnier', 'bostonglobe', 'demo',
            'gmg', 'gruponacion', 'infobae', 'mco', 'nzme', 'pmn', 'raycom',
            'spectator', 'tbt', 'tgam', 'tronc', 'wapo', 'wweek',
        ], 'video-api-cdn.%s.arcpublishing.com/api'),
    ]

    @staticmethod
    def _extract_urls(webpage):
        """Return 'arcpublishing:org:uuid' URLs for every PoWa player div
        with data-org/data-uuid attributes found in *webpage*."""
        entries = []
        # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview
        for powa_el in re.findall(r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage):
            powa = extract_attributes(powa_el) or {}
            org = powa.get('data-org')
            uuid = powa.get('data-uuid')
            if org and uuid:
                entries.append('arcpublishing:%s:%s' % (org, uuid))
        return entries

    def _real_extract(self, url):
        org, uuid = re.match(self._VALID_URL, url).groups()
        for orgs, tmpl in self._POWA_DEFAULTS:
            if org in orgs:
                base_api_tmpl = tmpl
                break
        else:
            # No per-org override matched: use the generic API host template.
            base_api_tmpl = '%s-prod-cdn.video-api.arcpublishing.com/api'
        if org == 'wapo':
            # The Washington Post's API lives under the 'washpost' subdomain.
            org = 'washpost'
        video = self._download_json(
            'https://%s/v1/ansvideos/findByUuid' % (base_api_tmpl % org),
            uuid, query={'uuid': uuid})[0]
        title = video['headlines']['basic']
        is_live = video.get('status') == 'live'

        # Track seen stream URLs to skip duplicate entries in 'streams'.
        urls = []
        formats = []
        for s in video.get('streams', []):
            s_url = s.get('url')
            if not s_url or s_url in urls:
                continue
            urls.append(s_url)
            stream_type = s.get('stream_type')
            if stream_type == 'smil':
                smil_formats = self._extract_smil_formats(
                    s_url, uuid, fatal=False)
                for f in smil_formats:
                    # RTMP entries ('.../cfx/st') need app/play_path fixups,
                    # and a float tbr looks like it is reported in Mbps here —
                    # converted to kbps via *1000 (TODO confirm against API).
                    if f['url'].endswith('/cfx/st'):
                        f['app'] = 'cfx/st'
                        if not f['play_path'].startswith('mp4:'):
                            f['play_path'] = 'mp4:' + f['play_path']
                        if isinstance(f['tbr'], float):
                            f['vbr'] = f['tbr'] * 1000
                            del f['tbr']
                            f['format_id'] = 'rtmp-%d' % f['vbr']
                formats.extend(smil_formats)
            elif stream_type in ('ts', 'hls'):
                m3u8_formats = self._extract_m3u8_formats(
                    s_url, uuid, 'mp4', 'm3u8' if is_live else 'm3u8_native',
                    m3u8_id='hls', fatal=False)
                # Skip the stream if every rendition lacks audio (note that an
                # empty/failed m3u8 result is also skipped, as all([]) is True).
                if all([f.get('acodec') == 'none' for f in m3u8_formats]):
                    continue
                for f in m3u8_formats:
                    # Demote single-track renditions below muxed ones.
                    if f.get('acodec') == 'none':
                        f['preference'] = -40
                    elif f.get('vcodec') == 'none':
                        f['preference'] = -50
                    height = f.get('height')
                    if not height:
                        continue
                    # The video bitrate appears to be embedded in the URL
                    # after the height, e.g. '..._720-2500...' (kbps).
                    vbr = self._search_regex(
                        r'[_x]%d[_-](\d+)' % height, f['url'], 'vbr', default=None)
                    if vbr:
                        f['vbr'] = int(vbr)
                formats.extend(m3u8_formats)
            else:
                # Plain progressive download entry.
                vbr = int_or_none(s.get('bitrate'))
                formats.append({
                    'format_id': '%s-%d' % (stream_type, vbr) if vbr else stream_type,
                    'vbr': vbr,
                    'width': int_or_none(s.get('width')),
                    'height': int_or_none(s.get('height')),
                    'filesize': int_or_none(s.get('filesize')),
                    'url': s_url,
                    'preference': -1,
                })
        self._sort_formats(
            formats, ('preference', 'width', 'height', 'vbr', 'filesize', 'tbr', 'ext', 'format_id'))

        # All subtitle URLs are filed under 'en'; the API does not expose a
        # language field here (NOTE(review): verify if multi-language exists).
        subtitles = {}
        for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []):
            subtitle_url = subtitle.get('url')
            if subtitle_url:
                subtitles.setdefault('en', []).append({'url': subtitle_url})

        return {
            'id': uuid,
            'title': self._live_title(title) if is_live else title,
            'thumbnail': try_get(video, lambda x: x['promo_image']['url']),
            'description': try_get(video, lambda x: x['subheadlines']['basic']),
            'formats': formats,
            # Scale 100: duration presumably reported in centiseconds — TODO confirm.
            'duration': int_or_none(video.get('duration'), 100),
            'timestamp': parse_iso8601(video.get('created_date')),
            'subtitles': subtitles,
            'is_live': is_live,
        }
index 854f5876757885c9966fe11ddc56b8553e54c561..fd46b1c7711d51daa80a0c08e2b8540fe8b54508 100644 (file)
@@ -6,13 +6,11 @@
 from .common import InfoExtractor
 from ..compat import compat_urlparse
 from ..utils import (
-    determine_ext,
     ExtractorError,
     float_or_none,
     int_or_none,
-    mimetype2ext,
     parse_iso8601,
-    strip_jsonp,
+    try_get,
 )
 
 
@@ -20,22 +18,27 @@ class ArkenaIE(InfoExtractor):
     _VALID_URL = r'''(?x)
                         https?://
                             (?:
-                                video\.arkena\.com/play2/embed/player\?|
+                                video\.(?:arkena|qbrick)\.com/play2/embed/player\?|
                                 play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+)
                             )
                         '''
     _TESTS = [{
-        'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411',
-        'md5': 'b96f2f71b359a8ecd05ce4e1daa72365',
+        'url': 'https://video.qbrick.com/play2/embed/player?accountId=1034090&mediaId=d8ab4607-00090107-aab86310',
+        'md5': '97f117754e5f3c020f5f26da4a44ebaf',
         'info_dict': {
-            'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe',
+            'id': 'd8ab4607-00090107-aab86310',
             'ext': 'mp4',
-            'title': 'Big Buck Bunny',
-            'description': 'Royalty free test video',
-            'timestamp': 1432816365,
-            'upload_date': '20150528',
-            'is_live': False,
+            'title': 'EM_HT20_117_roslund_v2.mp4',
+            'timestamp': 1608285912,
+            'upload_date': '20201218',
+            'duration': 1429.162667,
+            'subtitles': {
+                'sv': 'count:3',
+            },
         },
+    }, {
+        'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411',
+        'only_matching': True,
     }, {
         'url': 'https://play.arkena.com/config/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411/?callbackMethod=jQuery1111023664739129262213_1469227693893',
         'only_matching': True,
@@ -72,62 +75,89 @@ def _real_extract(self, url):
             if not video_id or not account_id:
                 raise ExtractorError('Invalid URL', expected=True)
 
-        playlist = self._download_json(
-            'https://play.arkena.com/config/avp/v2/player/media/%s/0/%s/?callbackMethod=_'
-            % (video_id, account_id),
-            video_id, transform_source=strip_jsonp)['Playlist'][0]
-
-        media_info = playlist['MediaInfo']
-        title = media_info['Title']
-        media_files = playlist['MediaFiles']
+        media = self._download_json(
+            'https://video.qbrick.com/api/v1/public/accounts/%s/medias/%s' % (account_id, video_id),
+            video_id, query={
+                # https://video.qbrick.com/docs/api/examples/library-api.html
+                'fields': 'asset/resources/*/renditions/*(height,id,language,links/*(href,mimeType),type,size,videos/*(audios/*(codec,sampleRate),bitrate,codec,duration,height,width),width),created,metadata/*(title,description),tags',
+            })
+        metadata = media.get('metadata') or {}
+        title = metadata['title']
 
-        is_live = False
+        duration = None
         formats = []
-        for kind_case, kind_formats in media_files.items():
-            kind = kind_case.lower()
-            for f in kind_formats:
-                f_url = f.get('Url')
-                if not f_url:
-                    continue
-                is_live = f.get('Live') == 'true'
-                exts = (mimetype2ext(f.get('Type')), determine_ext(f_url, None))
-                if kind == 'm3u8' or 'm3u8' in exts:
-                    formats.extend(self._extract_m3u8_formats(
-                        f_url, video_id, 'mp4', 'm3u8_native',
-                        m3u8_id=kind, fatal=False, live=is_live))
-                elif kind == 'flash' or 'f4m' in exts:
-                    formats.extend(self._extract_f4m_formats(
-                        f_url, video_id, f4m_id=kind, fatal=False))
-                elif kind == 'dash' or 'mpd' in exts:
-                    formats.extend(self._extract_mpd_formats(
-                        f_url, video_id, mpd_id=kind, fatal=False))
-                elif kind == 'silverlight':
-                    # TODO: process when ism is supported (see
-                    # https://github.com/ytdl-org/youtube-dl/issues/8118)
-                    continue
-                else:
-                    tbr = float_or_none(f.get('Bitrate'), 1000)
-                    formats.append({
-                        'url': f_url,
-                        'format_id': '%s-%d' % (kind, tbr) if tbr else kind,
-                        'tbr': tbr,
-                    })
+        thumbnails = []
+        subtitles = {}
+        for resource in media['asset']['resources']:
+            for rendition in (resource.get('renditions') or []):
+                rendition_type = rendition.get('type')
+                for i, link in enumerate(rendition.get('links') or []):
+                    href = link.get('href')
+                    if not href:
+                        continue
+                    if rendition_type == 'image':
+                        thumbnails.append({
+                            'filesize': int_or_none(rendition.get('size')),
+                            'height': int_or_none(rendition.get('height')),
+                            'id': rendition.get('id'),
+                            'url': href,
+                            'width': int_or_none(rendition.get('width')),
+                        })
+                    elif rendition_type == 'subtitle':
+                        subtitles.setdefault(rendition.get('language') or 'en', []).append({
+                            'url': href,
+                        })
+                    elif rendition_type == 'video':
+                        f = {
+                            'filesize': int_or_none(rendition.get('size')),
+                            'format_id': rendition.get('id'),
+                            'url': href,
+                        }
+                        video = try_get(rendition, lambda x: x['videos'][i], dict)
+                        if video:
+                            if not duration:
+                                duration = float_or_none(video.get('duration'))
+                            f.update({
+                                'height': int_or_none(video.get('height')),
+                                'tbr': int_or_none(video.get('bitrate'), 1000),
+                                'vcodec': video.get('codec'),
+                                'width': int_or_none(video.get('width')),
+                            })
+                            audio = try_get(video, lambda x: x['audios'][0], dict)
+                            if audio:
+                                f.update({
+                                    'acodec': audio.get('codec'),
+                                    'asr': int_or_none(audio.get('sampleRate')),
+                                })
+                        formats.append(f)
+                    elif rendition_type == 'index':
+                        mime_type = link.get('mimeType')
+                        if mime_type == 'application/smil+xml':
+                            formats.extend(self._extract_smil_formats(
+                                href, video_id, fatal=False))
+                        elif mime_type == 'application/x-mpegURL':
+                            formats.extend(self._extract_m3u8_formats(
+                                href, video_id, 'mp4', 'm3u8_native',
+                                m3u8_id='hls', fatal=False))
+                        elif mime_type == 'application/hds+xml':
+                            formats.extend(self._extract_f4m_formats(
+                                href, video_id, f4m_id='hds', fatal=False))
+                        elif mime_type == 'application/dash+xml':
+                            formats.extend(self._extract_mpd_formats(
+                                href, video_id, mpd_id='dash', fatal=False))
+                        elif mime_type == 'application/vnd.ms-sstr+xml':
+                            formats.extend(self._extract_ism_formats(
+                                href, video_id, ism_id='mss', fatal=False))
         self._sort_formats(formats)
 
-        description = media_info.get('Description')
-        video_id = media_info.get('VideoId') or video_id
-        timestamp = parse_iso8601(media_info.get('PublishDate'))
-        thumbnails = [{
-            'url': thumbnail['Url'],
-            'width': int_or_none(thumbnail.get('Size')),
-        } for thumbnail in (media_info.get('Poster') or []) if thumbnail.get('Url')]
-
         return {
             'id': video_id,
             'title': title,
-            'description': description,
-            'timestamp': timestamp,
-            'is_live': is_live,
+            'description': metadata.get('description'),
+            'timestamp': parse_iso8601(media.get('created')),
             'thumbnails': thumbnails,
+            'subtitles': subtitles,
+            'duration': duration,
+            'tags': media.get('tags'),
             'formats': formats,
         }
index 0348e680c6456946fb582ffadc90fd3fb9fa2e11..66ce7c6869306e0b33e99d575e01105fa809c032 100644 (file)
@@ -1,27 +1,91 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import functools
 import re
 
 from .common import InfoExtractor
 from .kaltura import KalturaIE
-from ..utils import extract_attributes
+from ..utils import (
+    extract_attributes,
+    int_or_none,
+    OnDemandPagedList,
+    parse_age_limit,
+    strip_or_none,
+    try_get,
+)
+
+
+class AsianCrushBaseIE(InfoExtractor):
+    _VALID_URL_BASE = r'https?://(?:www\.)?(?P<host>(?:(?:asiancrush|yuyutv|midnightpulp)\.com|(?:cocoro|retrocrush)\.tv))'
+    _KALTURA_KEYS = [
+        'video_url', 'progressive_url', 'download_url', 'thumbnail_url',
+        'widescreen_thumbnail_url', 'screencap_widescreen',
+    ]
+    _API_SUFFIX = {'retrocrush.tv': '-ott'}
+
+    def _call_api(self, host, endpoint, video_id, query, resource):
+        return self._download_json(
+            'https://api%s.%s/%s' % (self._API_SUFFIX.get(host, ''), host, endpoint), video_id,
+            'Downloading %s JSON metadata' % resource, query=query,
+            headers=self.geo_verification_headers())['objects']
+
+    def _download_object_data(self, host, object_id, resource):
+        return self._call_api(
+            host, 'search', object_id, {'id': object_id}, resource)[0]
+
+    def _get_object_description(self, obj):
+        return strip_or_none(obj.get('long_description') or obj.get('short_description'))
+
+    def _parse_video_data(self, video):
+        title = video['name']
+
+        entry_id, partner_id = [None] * 2
+        for k in self._KALTURA_KEYS:
+            k_url = video.get(k)
+            if k_url:
+                mobj = re.search(r'/p/(\d+)/.+?/entryId/([^/]+)/', k_url)
+                if mobj:
+                    partner_id, entry_id = mobj.groups()
+                    break
+
+        meta_categories = try_get(video, lambda x: x['meta']['categories'], list) or []
+        categories = list(filter(None, [c.get('name') for c in meta_categories]))
+
+        show_info = video.get('show_info') or {}
+
+        return {
+            '_type': 'url_transparent',
+            'url': 'kaltura:%s:%s' % (partner_id, entry_id),
+            'ie_key': KalturaIE.ie_key(),
+            'id': entry_id,
+            'title': title,
+            'description': self._get_object_description(video),
+            'age_limit': parse_age_limit(video.get('mpaa_rating') or video.get('tv_rating')),
+            'categories': categories,
+            'series': show_info.get('show_name'),
+            'season_number': int_or_none(show_info.get('season_num')),
+            'season_id': show_info.get('season_id'),
+            'episode_number': int_or_none(show_info.get('episode_num')),
+        }
 
 
-class AsianCrushIE(InfoExtractor):
-    _VALID_URL_BASE = r'https?://(?:www\.)?(?P<host>(?:(?:asiancrush|yuyutv|midnightpulp)\.com|cocoro\.tv))'
-    _VALID_URL = r'%s/video/(?:[^/]+/)?0+(?P<id>\d+)v\b' % _VALID_URL_BASE
+class AsianCrushIE(AsianCrushBaseIE):
+    _VALID_URL = r'%s/video/(?:[^/]+/)?0+(?P<id>\d+)v\b' % AsianCrushBaseIE._VALID_URL_BASE
     _TESTS = [{
-        'url': 'https://www.asiancrush.com/video/012869v/women-who-flirt/',
+        'url': 'https://www.asiancrush.com/video/004289v/women-who-flirt',
         'md5': 'c3b740e48d0ba002a42c0b72857beae6',
         'info_dict': {
             'id': '1_y4tmjm5r',
             'ext': 'mp4',
             'title': 'Women Who Flirt',
-            'description': 'md5:7e986615808bcfb11756eb503a751487',
+            'description': 'md5:b65c7e0ae03a85585476a62a186f924c',
             'timestamp': 1496936429,
             'upload_date': '20170608',
             'uploader_id': 'craig@crifkin.com',
+            'age_limit': 13,
+            'categories': 'count:5',
+            'duration': 5812,
         },
     }, {
         'url': 'https://www.asiancrush.com/video/she-was-pretty/011886v-pretty-episode-3/',
@@ -41,67 +105,35 @@ class AsianCrushIE(InfoExtractor):
     }, {
         'url': 'https://www.cocoro.tv/video/the-wonderful-wizard-of-oz/008878v-the-wonderful-wizard-of-oz-ep01/',
         'only_matching': True,
+    }, {
+        'url': 'https://www.retrocrush.tv/video/true-tears/012328v-i...gave-away-my-tears',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        host = mobj.group('host')
-        video_id = mobj.group('id')
-
-        webpage = self._download_webpage(url, video_id)
+        host, video_id = re.match(self._VALID_URL, url).groups()
 
-        entry_id, partner_id, title = [None] * 3
-
-        vars = self._parse_json(
-            self._search_regex(
+        if host == 'cocoro.tv':
+            webpage = self._download_webpage(url, video_id)
+            embed_vars = self._parse_json(self._search_regex(
                 r'iEmbedVars\s*=\s*({.+?})', webpage, 'embed vars',
-                default='{}'), video_id, fatal=False)
-        if vars:
-            entry_id = vars.get('entry_id')
-            partner_id = vars.get('partner_id')
-            title = vars.get('vid_label')
-
-        if not entry_id:
-            entry_id = self._search_regex(
-                r'\bentry_id["\']\s*:\s*["\'](\d+)', webpage, 'entry id')
+                default='{}'), video_id, fatal=False) or {}
+            video_id = embed_vars.get('entry_id') or video_id
 
-        player = self._download_webpage(
-            'https://api.%s/embeddedVideoPlayer' % host, video_id,
-            query={'id': entry_id})
+        video = self._download_object_data(host, video_id, 'video')
+        return self._parse_video_data(video)
 
-        kaltura_id = self._search_regex(
-            r'entry_id["\']\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1', player,
-            'kaltura id', group='id')
 
-        if not partner_id:
-            partner_id = self._search_regex(
-                r'/p(?:artner_id)?/(\d+)', player, 'partner id',
-                default='513551')
-
-        description = self._html_search_regex(
-            r'(?s)<div[^>]+\bclass=["\']description["\'][^>]*>(.+?)</div>',
-            webpage, 'description', fatal=False)
-
-        return {
-            '_type': 'url_transparent',
-            'url': 'kaltura:%s:%s' % (partner_id, kaltura_id),
-            'ie_key': KalturaIE.ie_key(),
-            'id': video_id,
-            'title': title,
-            'description': description,
-        }
-
-
-class AsianCrushPlaylistIE(InfoExtractor):
-    _VALID_URL = r'%s/series/0+(?P<id>\d+)s\b' % AsianCrushIE._VALID_URL_BASE
+class AsianCrushPlaylistIE(AsianCrushBaseIE):
+    _VALID_URL = r'%s/series/0+(?P<id>\d+)s\b' % AsianCrushBaseIE._VALID_URL_BASE
     _TESTS = [{
-        'url': 'https://www.asiancrush.com/series/012481s/scholar-walks-night/',
+        'url': 'https://www.asiancrush.com/series/006447s/fruity-samurai',
         'info_dict': {
-            'id': '12481',
-            'title': 'Scholar Who Walks the Night',
-            'description': 'md5:7addd7c5132a09fd4741152d96cce886',
+            'id': '6447',
+            'title': 'Fruity Samurai',
+            'description': 'md5:7535174487e4a202d3872a7fc8f2f154',
         },
-        'playlist_count': 20,
+        'playlist_count': 13,
     }, {
         'url': 'https://www.yuyutv.com/series/013920s/peep-show/',
         'only_matching': True,
@@ -111,35 +143,58 @@ class AsianCrushPlaylistIE(InfoExtractor):
     }, {
         'url': 'https://www.cocoro.tv/series/008549s/the-wonderful-wizard-of-oz/',
         'only_matching': True,
+    }, {
+        'url': 'https://www.retrocrush.tv/series/012355s/true-tears',
+        'only_matching': True,
     }]
+    _PAGE_SIZE = 1000000000
+
+    def _fetch_page(self, domain, parent_id, page):
+        videos = self._call_api(
+            domain, 'getreferencedobjects', parent_id, {
+                'max': self._PAGE_SIZE,
+                'object_type': 'video',
+                'parent_id': parent_id,
+                'start': page * self._PAGE_SIZE,
+            }, 'page %d' % (page + 1))
+        for video in videos:
+            yield self._parse_video_data(video)
 
     def _real_extract(self, url):
-        playlist_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, playlist_id)
-
-        entries = []
-
-        for mobj in re.finditer(
-                r'<a[^>]+href=(["\'])(?P<url>%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL,
-                webpage):
-            attrs = extract_attributes(mobj.group(0))
-            if attrs.get('class') == 'clearfix':
-                entries.append(self.url_result(
-                    mobj.group('url'), ie=AsianCrushIE.ie_key()))
-
-        title = self._html_search_regex(
-            r'(?s)<h1\b[^>]\bid=["\']movieTitle[^>]+>(.+?)</h1>', webpage,
-            'title', default=None) or self._og_search_title(
-            webpage, default=None) or self._html_search_meta(
-            'twitter:title', webpage, 'title',
-            default=None) or self._search_regex(
-            r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
-        if title:
-            title = re.sub(r'\s*\|\s*.+?$', '', title)
-
-        description = self._og_search_description(
-            webpage, default=None) or self._html_search_meta(
-            'twitter:description', webpage, 'description', fatal=False)
+        host, playlist_id = re.match(self._VALID_URL, url).groups()
+
+        if host == 'cocoro.tv':
+            webpage = self._download_webpage(url, playlist_id)
+
+            entries = []
+
+            for mobj in re.finditer(
+                    r'<a[^>]+href=(["\'])(?P<url>%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL,
+                    webpage):
+                attrs = extract_attributes(mobj.group(0))
+                if attrs.get('class') == 'clearfix':
+                    entries.append(self.url_result(
+                        mobj.group('url'), ie=AsianCrushIE.ie_key()))
+
+            title = self._html_search_regex(
+                r'(?s)<h1\b[^>]\bid=["\']movieTitle[^>]+>(.+?)</h1>', webpage,
+                'title', default=None) or self._og_search_title(
+                webpage, default=None) or self._html_search_meta(
+                'twitter:title', webpage, 'title',
+                default=None) or self._search_regex(
+                r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
+            if title:
+                title = re.sub(r'\s*\|\s*.+?$', '', title)
+
+            description = self._og_search_description(
+                webpage, default=None) or self._html_search_meta(
+                'twitter:description', webpage, 'description', fatal=False)
+        else:
+            show = self._download_object_data(host, playlist_id, 'show')
+            title = show.get('name')
+            description = self._get_object_description(show)
+            entries = OnDemandPagedList(
+                functools.partial(self._fetch_page, host, playlist_id),
+                self._PAGE_SIZE)
 
         return self.playlist_result(entries, playlist_id, title, description)
index 54cbcdc8e5205075e463323b940f35c403086e08..b4daee54ea2b8fc245201d34f10beee43fbe5a83 100644 (file)
@@ -49,22 +49,17 @@ class BBCCoUkIE(InfoExtractor):
     _LOGIN_URL = 'https://account.bbc.com/signin'
     _NETRC_MACHINE = 'bbc'
 
-    _MEDIASELECTOR_URLS = [
+    _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
+    _MEDIA_SETS = [
         # Provides HQ HLS streams with even better quality than the pc mediaset but fails
         # with geolocation in some cases when it's even not geo restricted at all (e.g.
         # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
-        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
-        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
+        'iptv-all',
+        'pc',
     ]
 
-    _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
     _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
 
-    _NAMESPACES = (
-        _MEDIASELECTION_NS,
-        _EMP_PLAYLIST_NS,
-    )
-
     _TESTS = [
         {
             'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
@@ -261,8 +256,6 @@ class BBCCoUkIE(InfoExtractor):
             'only_matching': True,
         }]
 
-    _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
-
     def _login(self):
         username, password = self._get_login_info()
         if username is None:
@@ -307,22 +300,14 @@ def _extract_asx_playlist(self, connection, programme_id):
     def _extract_items(self, playlist):
         return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
 
-    def _findall_ns(self, element, xpath):
-        elements = []
-        for ns in self._NAMESPACES:
-            elements.extend(element.findall(xpath % ns))
-        return elements
-
     def _extract_medias(self, media_selection):
-        error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
-        if error is None:
-            media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
-        if error is not None:
-            raise BBCCoUkIE.MediaSelectionError(error.get('id'))
-        return self._findall_ns(media_selection, './{%s}media')
+        error = media_selection.get('result')
+        if error:
+            raise BBCCoUkIE.MediaSelectionError(error)
+        return media_selection.get('media') or []
 
     def _extract_connections(self, media):
-        return self._findall_ns(media, './{%s}connection')
+        return media.get('connection') or []
 
     def _get_subtitles(self, media, programme_id):
         subtitles = {}
@@ -334,13 +319,13 @@ def _get_subtitles(self, media, programme_id):
                 cc_url, programme_id, 'Downloading captions', fatal=False)
             if not isinstance(captions, compat_etree_Element):
                 continue
-            lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
-            subtitles[lang] = [
+            subtitles['en'] = [
                 {
                     'url': connection.get('href'),
                     'ext': 'ttml',
                 },
             ]
+            break
         return subtitles
 
     def _raise_extractor_error(self, media_selection_error):
@@ -350,10 +335,10 @@ def _raise_extractor_error(self, media_selection_error):
 
     def _download_media_selector(self, programme_id):
         last_exception = None
-        for mediaselector_url in self._MEDIASELECTOR_URLS:
+        for media_set in self._MEDIA_SETS:
             try:
                 return self._download_media_selector_url(
-                    mediaselector_url % programme_id, programme_id)
+                    self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
             except BBCCoUkIE.MediaSelectionError as e:
                 if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
                     last_exception = e
@@ -362,8 +347,8 @@ def _download_media_selector(self, programme_id):
         self._raise_extractor_error(last_exception)
 
     def _download_media_selector_url(self, url, programme_id=None):
-        media_selection = self._download_xml(
-            url, programme_id, 'Downloading media selection XML',
+        media_selection = self._download_json(
+            url, programme_id, 'Downloading media selection JSON',
             expected_status=(403, 404))
         return self._process_media_selector(media_selection, programme_id)
 
@@ -377,7 +362,6 @@ def _process_media_selector(self, media_selection, programme_id):
             if kind in ('video', 'audio'):
                 bitrate = int_or_none(media.get('bitrate'))
                 encoding = media.get('encoding')
-                service = media.get('service')
                 width = int_or_none(media.get('width'))
                 height = int_or_none(media.get('height'))
                 file_size = int_or_none(media.get('media_file_size'))
@@ -392,8 +376,6 @@ def _process_media_selector(self, media_selection, programme_id):
                     supplier = connection.get('supplier')
                     transfer_format = connection.get('transferFormat')
                     format_id = supplier or conn_kind or protocol
-                    if service:
-                        format_id = '%s_%s' % (service, format_id)
                     # ASX playlist
                     if supplier == 'asx':
                         for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
@@ -408,20 +390,11 @@ def _process_media_selector(self, media_selection, programme_id):
                         formats.extend(self._extract_m3u8_formats(
                             href, programme_id, ext='mp4', entry_protocol='m3u8_native',
                             m3u8_id=format_id, fatal=False))
-                        if re.search(self._USP_RE, href):
-                            usp_formats = self._extract_m3u8_formats(
-                                re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href),
-                                programme_id, ext='mp4', entry_protocol='m3u8_native',
-                                m3u8_id=format_id, fatal=False)
-                            for f in usp_formats:
-                                if f.get('height') and f['height'] > 720:
-                                    continue
-                                formats.append(f)
                     elif transfer_format == 'hds':
                         formats.extend(self._extract_f4m_formats(
                             href, programme_id, f4m_id=format_id, fatal=False))
                     else:
-                        if not service and not supplier and bitrate:
+                        if not supplier and bitrate:
                             format_id += '-%d' % bitrate
                         fmt = {
                             'format_id': format_id,
@@ -554,7 +527,7 @@ def _real_extract(self, url):
         webpage = self._download_webpage(url, group_id, 'Downloading video page')
 
         error = self._search_regex(
-            r'<div\b[^>]+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<',
+            r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
             webpage, 'error', default=None)
         if error:
             raise ExtractorError(error, expected=True)
@@ -607,16 +580,9 @@ class BBCIE(BBCCoUkIE):
     IE_DESC = 'BBC'
     _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
 
-    _MEDIASELECTOR_URLS = [
-        # Provides HQ HLS streams but fails with geolocation in some cases when it's
-        # even not geo restricted at all
-        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
-        # Provides more formats, namely direct mp4 links, but fails on some videos with
-        # notukerror for non UK (?) users (e.g.
-        # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
-        'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
-        # Provides fewer formats, but works everywhere for everybody (hopefully)
-        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
+    _MEDIA_SETS = [
+        'mobile-tablet-main',
+        'pc',
     ]
 
     _TESTS = [{
diff --git a/youtube_dlc/extractor/beampro.py b/youtube_dlc/extractor/beampro.py
deleted file mode 100644 (file)
index 86abdae..0000000
+++ /dev/null
@@ -1,194 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    clean_html,
-    compat_str,
-    float_or_none,
-    int_or_none,
-    parse_iso8601,
-    try_get,
-    urljoin,
-)
-
-
-class BeamProBaseIE(InfoExtractor):
-    _API_BASE = 'https://mixer.com/api/v1'
-    _RATINGS = {'family': 0, 'teen': 13, '18+': 18}
-
-    def _extract_channel_info(self, chan):
-        user_id = chan.get('userId') or try_get(chan, lambda x: x['user']['id'])
-        return {
-            'uploader': chan.get('token') or try_get(
-                chan, lambda x: x['user']['username'], compat_str),
-            'uploader_id': compat_str(user_id) if user_id else None,
-            'age_limit': self._RATINGS.get(chan.get('audience')),
-        }
-
-
-class BeamProLiveIE(BeamProBaseIE):
-    IE_NAME = 'Mixer:live'
-    _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/(?P<id>[^/?#&]+)'
-    _TEST = {
-        'url': 'http://mixer.com/niterhayven',
-        'info_dict': {
-            'id': '261562',
-            'ext': 'mp4',
-            'title': 'Introducing The Witcher 3 //  The Grind Starts Now!',
-            'description': 'md5:0b161ac080f15fe05d18a07adb44a74d',
-            'thumbnail': r're:https://.*\.jpg$',
-            'timestamp': 1483477281,
-            'upload_date': '20170103',
-            'uploader': 'niterhayven',
-            'uploader_id': '373396',
-            'age_limit': 18,
-            'is_live': True,
-            'view_count': int,
-        },
-        'skip': 'niterhayven is offline',
-        'params': {
-            'skip_download': True,
-        },
-    }
-
-    _MANIFEST_URL_TEMPLATE = '%s/channels/%%s/manifest.%%s' % BeamProBaseIE._API_BASE
-
-    @classmethod
-    def suitable(cls, url):
-        return False if BeamProVodIE.suitable(url) else super(BeamProLiveIE, cls).suitable(url)
-
-    def _real_extract(self, url):
-        channel_name = self._match_id(url)
-
-        chan = self._download_json(
-            '%s/channels/%s' % (self._API_BASE, channel_name), channel_name)
-
-        if chan.get('online') is False:
-            raise ExtractorError(
-                '{0} is offline'.format(channel_name), expected=True)
-
-        channel_id = chan['id']
-
-        def manifest_url(kind):
-            return self._MANIFEST_URL_TEMPLATE % (channel_id, kind)
-
-        formats = self._extract_m3u8_formats(
-            manifest_url('m3u8'), channel_name, ext='mp4', m3u8_id='hls',
-            fatal=False)
-        formats.extend(self._extract_smil_formats(
-            manifest_url('smil'), channel_name, fatal=False))
-        self._sort_formats(formats)
-
-        info = {
-            'id': compat_str(chan.get('id') or channel_name),
-            'title': self._live_title(chan.get('name') or channel_name),
-            'description': clean_html(chan.get('description')),
-            'thumbnail': try_get(
-                chan, lambda x: x['thumbnail']['url'], compat_str),
-            'timestamp': parse_iso8601(chan.get('updatedAt')),
-            'is_live': True,
-            'view_count': int_or_none(chan.get('viewersTotal')),
-            'formats': formats,
-        }
-        info.update(self._extract_channel_info(chan))
-
-        return info
-
-
-class BeamProVodIE(BeamProBaseIE):
-    IE_NAME = 'Mixer:vod'
-    _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/[^/?#&]+\?.*?\bvod=(?P<id>[^?#&]+)'
-    _TESTS = [{
-        'url': 'https://mixer.com/willow8714?vod=2259830',
-        'md5': 'b2431e6e8347dc92ebafb565d368b76b',
-        'info_dict': {
-            'id': '2259830',
-            'ext': 'mp4',
-            'title': 'willow8714\'s Channel',
-            'duration': 6828.15,
-            'thumbnail': r're:https://.*source\.png$',
-            'timestamp': 1494046474,
-            'upload_date': '20170506',
-            'uploader': 'willow8714',
-            'uploader_id': '6085379',
-            'age_limit': 13,
-            'view_count': int,
-        },
-        'params': {
-            'skip_download': True,
-        },
-    }, {
-        'url': 'https://mixer.com/streamer?vod=IxFno1rqC0S_XJ1a2yGgNw',
-        'only_matching': True,
-    }, {
-        'url': 'https://mixer.com/streamer?vod=Rh3LY0VAqkGpEQUe2pN-ig',
-        'only_matching': True,
-    }]
-
-    @staticmethod
-    def _extract_format(vod, vod_type):
-        if not vod.get('baseUrl'):
-            return []
-
-        if vod_type == 'hls':
-            filename, protocol = 'manifest.m3u8', 'm3u8_native'
-        elif vod_type == 'raw':
-            filename, protocol = 'source.mp4', 'https'
-        else:
-            assert False
-
-        data = vod.get('data') if isinstance(vod.get('data'), dict) else {}
-
-        format_id = [vod_type]
-        if isinstance(data.get('Height'), compat_str):
-            format_id.append('%sp' % data['Height'])
-
-        return [{
-            'url': urljoin(vod['baseUrl'], filename),
-            'format_id': '-'.join(format_id),
-            'ext': 'mp4',
-            'protocol': protocol,
-            'width': int_or_none(data.get('Width')),
-            'height': int_or_none(data.get('Height')),
-            'fps': int_or_none(data.get('Fps')),
-            'tbr': int_or_none(data.get('Bitrate'), 1000),
-        }]
-
-    def _real_extract(self, url):
-        vod_id = self._match_id(url)
-
-        vod_info = self._download_json(
-            '%s/recordings/%s' % (self._API_BASE, vod_id), vod_id)
-
-        state = vod_info.get('state')
-        if state != 'AVAILABLE':
-            raise ExtractorError(
-                'VOD %s is not available (state: %s)' % (vod_id, state),
-                expected=True)
-
-        formats = []
-        thumbnail_url = None
-
-        for vod in vod_info['vods']:
-            vod_type = vod.get('format')
-            if vod_type in ('hls', 'raw'):
-                formats.extend(self._extract_format(vod, vod_type))
-            elif vod_type == 'thumbnail':
-                thumbnail_url = urljoin(vod.get('baseUrl'), 'source.png')
-
-        self._sort_formats(formats)
-
-        info = {
-            'id': vod_id,
-            'title': vod_info.get('name') or vod_id,
-            'duration': float_or_none(vod_info.get('duration')),
-            'thumbnail': thumbnail_url,
-            'timestamp': parse_iso8601(vod_info.get('createdAt')),
-            'view_count': int_or_none(vod_info.get('viewsTotal')),
-            'formats': formats,
-        }
-        info.update(self._extract_channel_info(vod_info.get('channel') or {}))
-
-        return info
diff --git a/youtube_dlc/extractor/bongacams.py b/youtube_dlc/extractor/bongacams.py
new file mode 100644 (file)
index 0000000..180542f
--- /dev/null
@@ -0,0 +1,60 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    try_get,
+    urlencode_postdata,
+)
+
+
+class BongaCamsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?P<host>(?:[^/]+\.)?bongacams\d*\.com)/(?P<id>[^/?&#]+)'
+    _TESTS = [{
+        'url': 'https://de.bongacams.com/azumi-8',
+        'only_matching': True,
+    }, {
+        'url': 'https://cn.bongacams.com/azumi-8',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        host = mobj.group('host')
+        channel_id = mobj.group('id')
+
+        amf = self._download_json(
+            'https://%s/tools/amf.php' % host, channel_id,
+            data=urlencode_postdata((
+                ('method', 'getRoomData'),
+                ('args[]', channel_id),
+                ('args[]', 'false'),
+            )), headers={'X-Requested-With': 'XMLHttpRequest'})
+
+        server_url = amf['localData']['videoServerUrl']
+
+        uploader_id = try_get(
+            amf, lambda x: x['performerData']['username'], compat_str) or channel_id
+        uploader = try_get(
+            amf, lambda x: x['performerData']['displayName'], compat_str)
+        like_count = int_or_none(try_get(
+            amf, lambda x: x['performerData']['loversCount']))
+
+        formats = self._extract_m3u8_formats(
+            '%s/hls/stream_%s/playlist.m3u8' % (server_url, uploader_id),
+            channel_id, 'mp4', m3u8_id='hls', live=True)
+        self._sort_formats(formats)
+
+        return {
+            'id': channel_id,
+            'title': self._live_title(uploader or uploader_id),
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'like_count': like_count,
+            'age_limit': 18,
+            'is_live': True,
+            'formats': formats,
+        }
index c6ca939ddd4cd323cfcea456502dc57575999253..6022076acaf24727fe436b7fb50a16f1a9029259 100644 (file)
@@ -28,6 +28,7 @@
     parse_iso8601,
     smuggle_url,
     str_or_none,
+    try_get,
     unescapeHTML,
     unsmuggle_url,
     UnsupportedError,
@@ -470,18 +471,18 @@ def _extract_urls(ie, webpage):
     def _parse_brightcove_metadata(self, json_data, video_id, headers={}):
         title = json_data['name'].strip()
 
+        num_drm_sources = 0
         formats = []
-        sources_num = len(json_data.get('sources'))
-        key_systems_present = 0
-        for source in json_data.get('sources', []):
+        sources = json_data.get('sources') or []
+        for source in sources:
             container = source.get('container')
             ext = mimetype2ext(source.get('type'))
             src = source.get('src')
-            # https://apis.support.brightcove.com/playback/references/playback-api-video-fields-reference.html
-            if source.get('key_systems'):
-                key_systems_present += 1
+            # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
+            if container == 'WVM' or source.get('key_systems'):
+                num_drm_sources += 1
                 continue
-            elif ext == 'ism' or container == 'WVM':
+            elif ext == 'ism':
                 continue
             elif ext == 'm3u8' or container == 'M2TS':
                 if not src:
@@ -539,23 +540,14 @@ def build_format_id(kind):
                     })
                 formats.append(f)
 
-        if sources_num == key_systems_present:
-            raise ExtractorError('This video is DRM protected', expected=True)
-
         if not formats:
-            # for sonyliv.com DRM protected videos
-            s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl')
-            if s3_source_url:
-                formats.append({
-                    'url': s3_source_url,
-                    'format_id': 'source',
-                })
-
-        errors = json_data.get('errors')
-        if not formats and errors:
-            error = errors[0]
-            raise ExtractorError(
-                error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)
+            errors = json_data.get('errors')
+            if errors:
+                error = errors[0]
+                raise ExtractorError(
+                    error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)
+            if sources and num_drm_sources == len(sources):
+                raise ExtractorError('This video is DRM protected.', expected=True)
 
         self._sort_formats(formats)
 
@@ -609,24 +601,27 @@ def _real_extract(self, url):
         store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x)
 
         def extract_policy_key():
-            webpage = self._download_webpage(
-                'http://players.brightcove.net/%s/%s_%s/index.min.js'
-                % (account_id, player_id, embed), video_id)
-
-            policy_key = None
+            base_url = 'http://players.brightcove.net/%s/%s_%s/' % (account_id, player_id, embed)
+            config = self._download_json(
+                base_url + 'config.json', video_id, fatal=False) or {}
+            policy_key = try_get(
+                config, lambda x: x['video_cloud']['policy_key'])
+            if not policy_key:
+                webpage = self._download_webpage(
+                    base_url + 'index.min.js', video_id)
 
-            catalog = self._search_regex(
-                r'catalog\(({.+?})\);', webpage, 'catalog', default=None)
-            if catalog:
-                catalog = self._parse_json(
-                    js_to_json(catalog), video_id, fatal=False)
+                catalog = self._search_regex(
+                    r'catalog\(({.+?})\);', webpage, 'catalog', default=None)
                 if catalog:
-                    policy_key = catalog.get('policyKey')
-
-            if not policy_key:
-                policy_key = self._search_regex(
-                    r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
-                    webpage, 'policy key', group='pk')
+                    catalog = self._parse_json(
+                        js_to_json(catalog), video_id, fatal=False)
+                    if catalog:
+                        policy_key = catalog.get('policyKey')
+
+                if not policy_key:
+                    policy_key = self._search_regex(
+                        r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
+                        webpage, 'policy key', group='pk')
 
             store_pk(policy_key)
             return policy_key
index 90852a9ef9b7a6707ebcd1ade6cccb9ff9bbde5a..3b7e1a8b9f655dc7fcde1e663b538e02175f055b 100644 (file)
 
 
 class CBSLocalIE(AnvatoIE):
-    _VALID_URL = r'https?://[a-z]+\.cbslocal\.com/(?:\d+/\d+/\d+|video)/(?P<id>[0-9a-z-]+)'
+    _VALID_URL_BASE = r'https?://[a-z]+\.cbslocal\.com/'
+    _VALID_URL = _VALID_URL_BASE + r'video/(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/',
+        'info_dict': {
+            'id': '3580809',
+            'ext': 'mp4',
+            'title': 'A Very Blue Anniversary',
+            'description': 'CBS2’s Cindy Hsu has more.',
+            'thumbnail': 're:^https?://.*',
+            'timestamp': int,
+            'upload_date': r're:^\d{8}$',
+            'uploader': 'CBS',
+            'subtitles': {
+                'en': 'mincount:5',
+            },
+            'categories': [
+                'Stations\\Spoken Word\\WCBSTV',
+                'Syndication\\AOL',
+                'Syndication\\MSN',
+                'Syndication\\NDN',
+                'Syndication\\Yahoo',
+                'Content\\News',
+                'Content\\News\\Local News',
+            ],
+            'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'],
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        mcp_id = self._match_id(url)
+        return self.url_result(
+            'anvato:anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67:' + mcp_id, 'Anvato', mcp_id)
+
+
+class CBSLocalArticleIE(AnvatoIE):
+    _VALID_URL = CBSLocalIE._VALID_URL_BASE + r'\d+/\d+/\d+/(?P<id>[0-9a-z-]+)'
 
     _TESTS = [{
         # Anvato backend
@@ -52,31 +92,6 @@ class CBSLocalIE(AnvatoIE):
             # m3u8 download
             'skip_download': True,
         },
-    }, {
-        'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/',
-        'info_dict': {
-            'id': '3580809',
-            'ext': 'mp4',
-            'title': 'A Very Blue Anniversary',
-            'description': 'CBS2’s Cindy Hsu has more.',
-            'thumbnail': 're:^https?://.*',
-            'timestamp': int,
-            'upload_date': r're:^\d{8}$',
-            'uploader': 'CBS',
-            'subtitles': {
-                'en': 'mincount:5',
-            },
-            'categories': [
-                'Stations\\Spoken Word\\WCBSTV',
-                'Syndication\\AOL',
-                'Syndication\\MSN',
-                'Syndication\\NDN',
-                'Syndication\\Yahoo',
-                'Content\\News',
-                'Content\\News\\Local News',
-            ],
-            'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'],
-        },
     }]
 
     def _real_extract(self, url):
index 774b7105580f69dedac4318c82aee2ab62c27381..2d950fa05c47acdd9f9039e1239656af331cc280 100644 (file)
@@ -96,7 +96,10 @@ def _real_extract(self, url):
             config['data_src'] % path, page_title, {
                 'default': {
                     'media_src': config['media_src'],
-                }
+                },
+                'f4m': {
+                    'host': 'cnn-vh.akamaihd.net',
+                },
             })
 
 
index 1ffe37bde6c13ffdf8b7b5377c08dd4230025b75..9dfa9a60dbea67558a63651d0766567cd9deeee1 100644 (file)
@@ -337,8 +337,8 @@ class InfoExtractor(object):
     object, each element of which is a valid dictionary by this specification.
 
     Additionally, playlists can have "id", "title", "description", "uploader",
-    "uploader_id", "uploader_url" attributes with the same semantics as videos
-    (see above).
+    "uploader_id", "uploader_url", "duration" attributes with the same semantics
+    as videos (see above).
 
 
     _type "multi_video" indicates that there are multiple videos that
@@ -1238,8 +1238,16 @@ def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
             'ViewAction': 'view',
         }
 
+        def extract_interaction_type(e):
+            interaction_type = e.get('interactionType')
+            if isinstance(interaction_type, dict):
+                interaction_type = interaction_type.get('@type')
+            return str_or_none(interaction_type)
+
         def extract_interaction_statistic(e):
             interaction_statistic = e.get('interactionStatistic')
+            if isinstance(interaction_statistic, dict):
+                interaction_statistic = [interaction_statistic]
             if not isinstance(interaction_statistic, list):
                 return
             for is_e in interaction_statistic:
@@ -1247,8 +1255,8 @@ def extract_interaction_statistic(e):
                     continue
                 if is_e.get('@type') != 'InteractionCounter':
                     continue
-                interaction_type = is_e.get('interactionType')
-                if not isinstance(interaction_type, compat_str):
+                interaction_type = extract_interaction_type(is_e)
+                if not interaction_type:
                     continue
                 # For interaction count some sites provide string instead of
                 # an integer (as per spec) with non digit characters (e.g. ",")
@@ -2704,16 +2712,18 @@ def _media_formats(src, cur_media_type, type_info={}):
         # amp-video and amp-audio are very similar to their HTML5 counterparts
         # so we wll include them right here (see
         # https://www.ampproject.org/docs/reference/components/amp-video)
-        media_tags = [(media_tag, media_type, '')
-                      for media_tag, media_type
-                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
+        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
+        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
+        media_tags = [(media_tag, media_tag_name, media_type, '')
+                      for media_tag, media_tag_name, media_type
+                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
         media_tags.extend(re.findall(
             # We only allow video|audio followed by a whitespace or '>'.
             # Allowing more characters may end up in significant slow down (see
             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
             # http://www.porntrex.com/maps/videositemap.xml).
-            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
-        for media_tag, media_type, media_content in media_tags:
+            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
+        for media_tag, _, media_type, media_content in media_tags:
             media_info = {
                 'formats': [],
                 'subtitles': {},
@@ -2786,6 +2796,13 @@ def _media_formats(src, cur_media_type, type_info={}):
         return entries
 
     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
+        signed = 'hdnea=' in manifest_url
+        if not signed:
+            # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
+            manifest_url = re.sub(
+                r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
+                '', manifest_url).strip('?')
+
         formats = []
 
         hdcore_sign = 'hdcore=3.7.0'
@@ -2805,33 +2822,32 @@ def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
         hls_host = hosts.get('hls')
         if hls_host:
             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
-        formats.extend(self._extract_m3u8_formats(
+        m3u8_formats = self._extract_m3u8_formats(
             m3u8_url, video_id, 'mp4', 'm3u8_native',
-            m3u8_id='hls', fatal=False))
+            m3u8_id='hls', fatal=False)
+        formats.extend(m3u8_formats)
 
         http_host = hosts.get('http')
-        if http_host and 'hdnea=' not in manifest_url:
-            REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+'
+        if http_host and m3u8_formats and not signed:
+            REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
             qualities_length = len(qualities)
-            if len(formats) in (qualities_length + 1, qualities_length * 2 + 1):
+            if len(m3u8_formats) in (qualities_length, qualities_length + 1):
                 i = 0
-                http_formats = []
-                for f in formats:
-                    if f['protocol'] == 'm3u8_native' and f['vcodec'] != 'none':
+                for f in m3u8_formats:
+                    if f['vcodec'] != 'none':
                         for protocol in ('http', 'https'):
                             http_f = f.copy()
                             del http_f['manifest_url']
                             http_url = re.sub(
-                                REPL_REGEX, protocol + r'://%s/\1%s\3' % (http_host, qualities[i]), f['url'])
+                                REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
                             http_f.update({
                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                                 'url': http_url,
                                 'protocol': protocol,
                             })
-                            http_formats.append(http_f)
+                            formats.append(http_f)
                         i += 1
-                formats.extend(http_formats)
 
         return formats
 
index 67d6df4b0eb285f588b1352eae4a46e0c4b220fd..766942146ff60458061bd35c048b6dd399e253f1 100644 (file)
@@ -10,6 +10,8 @@
     find_xpath_attr,
     get_element_by_class,
     int_or_none,
+    js_to_json,
+    merge_dicts,
     smuggle_url,
     unescapeHTML,
 )
@@ -98,6 +100,26 @@ def _real_extract(self, url):
                     bc_attr['data-bcid'])
                 return self.url_result(smuggle_url(bc_url, {'source_url': url}))
 
+        def add_referer(formats):
+            for f in formats:
+                f.setdefault('http_headers', {})['Referer'] = url
+
+        # As of 01.12.2020 this path looks to cover all cases making the rest
+        # of the code unnecessary
+        jwsetup = self._parse_json(
+            self._search_regex(
+                r'(?s)jwsetup\s*=\s*({.+?})\s*;', webpage, 'jwsetup',
+                default='{}'),
+            video_id, transform_source=js_to_json, fatal=False)
+        if jwsetup:
+            info = self._parse_jwplayer_data(
+                jwsetup, video_id, require_title=False, m3u8_id='hls',
+                base_url=url)
+            add_referer(info['formats'])
+            ld_info = self._search_json_ld(webpage, video_id, default={})
+            return merge_dicts(info, ld_info)
+
+        # Obsolete
         # We first look for clipid, because clipprog always appears before
         patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')]
         results = list(filter(None, (re.search(p, webpage) for p in patterns)))
@@ -165,6 +187,7 @@ def get_text_attr(d, attr):
                 formats = self._extract_m3u8_formats(
                     path, video_id, 'mp4', entry_protocol='m3u8_native',
                     m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }]
+            add_referer(formats)
             self._sort_formats(formats)
             entries.append({
                 'id': '%s_%d' % (video_id, partnum + 1),
diff --git a/youtube_dlc/extractor/ctv.py b/youtube_dlc/extractor/ctv.py
new file mode 100644 (file)
index 0000000..756bcc2
--- /dev/null
@@ -0,0 +1,59 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class CTVIE(InfoExtractor):
+    """Extractor for CTV.ca show and movie pages.
+
+    Resolves the page path through CTV's GraphQL endpoint to a 9c9media
+    axis ID, then delegates extraction to the NineCNineMedia extractor.
+    """
+    _VALID_URL = r'https?://(?:www\.)?ctv\.ca/(?P<id>(?:show|movie)s/[^/]+/[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://www.ctv.ca/shows/your-morning/wednesday-december-23-2020-s5e88',
+        'info_dict': {
+            'id': '2102249',
+            'ext': 'flv',
+            'title': 'Wednesday, December 23, 2020',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'Your Morning delivers original perspectives and unique insights into the headlines of the day.',
+            'timestamp': 1608732000,
+            'upload_date': '20201223',
+            'series': 'Your Morning',
+            'season': '2020-2021',
+            'season_number': 5,
+            'episode_number': 88,
+            'tags': ['Your Morning'],
+            'categories': ['Talk Show'],
+            'duration': 7467.126,
+        },
+    }, {
+        'url': 'https://www.ctv.ca/movies/adam-sandlers-eight-crazy-nights/adam-sandlers-eight-crazy-nights',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        # Resolve the page path to the underlying 9c9media content
+        # (axisId + videoPlayerDestCode) via CTV's GraphQL API.
+        content = self._download_json(
+            'https://www.ctv.ca/space-graphql/graphql', display_id, query={
+                'query': '''{
+  resolvedPath(path: "/%s") {
+    lastSegment {
+      content {
+        ... on AxisContent {
+          axisId
+          videoPlayerDestCode
+        }
+      }
+    }
+  }
+}''' % display_id,
+            })['data']['resolvedPath']['lastSegment']['content']
+        video_id = content['axisId']
+        return self.url_result(
+            '9c9media:%s:%s' % (content['videoPlayerDestCode'], video_id),
+            'NineCNineMedia', video_id)
index 390e79f8cfae9cc0ec4b0f045ed8bac88bd6c523..c0036adb619857c9a8b01398503f6eb4f44b0d84 100644 (file)
@@ -29,7 +29,7 @@ class DRTVIE(InfoExtractor):
                     https?://
                         (?:
                             (?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*|
-                            (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode)/
+                            (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/
                         )
                         (?P<id>[\da-z_-]+)
                     '''
@@ -111,6 +111,9 @@ class DRTVIE(InfoExtractor):
     }, {
         'url': 'https://dr-massive.com/drtv/se/bonderoeven_71769',
         'only_matching': True,
+    }, {
+        'url': 'https://www.dr.dk/drtv/program/jagten_220924',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
index fe42821c731c711e8f0974fd4ce48f5c9aee8e8f..bfecd3a418805761fc3516cf7afb1e14f9d16283 100644 (file)
@@ -16,7 +16,7 @@
 
 
 class EpornerIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?eporner\.com/(?:hd-porn|embed)/(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?'
+    _VALID_URL = r'https?://(?:www\.)?eporner\.com/(?:(?:hd-porn|embed)/|video-)(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?'
     _TESTS = [{
         'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/',
         'md5': '39d486f046212d8e1b911c52ab4691f8',
@@ -43,7 +43,10 @@ class EpornerIE(InfoExtractor):
         'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0',
         'only_matching': True,
     }, {
-        'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0',
+        'url': 'http://www.eporner.com/embed/3YRUtzMcWn0',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.eporner.com/video-FJsA19J3Y3H/one-of-the-greats/',
         'only_matching': True,
     }]
 
@@ -57,7 +60,7 @@ def _real_extract(self, url):
         video_id = self._match_id(urlh.geturl())
 
         hash = self._search_regex(
-            r'hash\s*:\s*["\']([\da-f]{32})', webpage, 'hash')
+            r'hash\s*[:=]\s*["\']([\da-f]{32})', webpage, 'hash')
 
         title = self._og_search_title(webpage, default=None) or self._html_search_regex(
             r'<title>(.+?) - EPORNER', webpage, 'title')
@@ -115,8 +118,8 @@ def calc_hash(s):
         duration = parse_duration(self._html_search_meta(
             'duration', webpage, default=None))
         view_count = str_to_int(self._search_regex(
-            r'id="cinemaviews">\s*([0-9,]+)\s*<small>views',
-            webpage, 'view count', fatal=False))
+            r'id=["\']cinemaviews1["\'][^>]*>\s*([0-9,]+)',
+            webpage, 'view count', default=None))
 
         return merge_dicts(json_ld, {
             'id': video_id,
index 5ad9d27173a2900d1b05f65395fcc54df533289b..200cf13953e642d6eee9d0f8ec5d7694a9e7e6ec 100644 (file)
 from .adultswim import AdultSwimIE
 from .aenetworks import (
     AENetworksIE,
+    AENetworksCollectionIE,
+    AENetworksShowIE,
     HistoryTopicIE,
+    HistoryPlayerIE,
+    BiographyIE,
 )
 from .afreecatv import AfreecaTVIE
 from .airmozilla import AirMozillaIE
@@ -56,6 +60,7 @@
     AppleTrailersSectionIE,
 )
 from .archiveorg import ArchiveOrgIE
+from .arcpublishing import ArcPublishingIE
 from .arkena import ArkenaIE
 from .ard import (
     ARDBetaMediathekIE,
     BBCCoUkPlaylistIE,
     BBCIE,
 )
-from .beampro import (
-    BeamProLiveIE,
-    BeamProVodIE,
-)
 from .beeg import BeegIE
 from .behindkink import BehindKinkIE
 from .bellmedia import BellMediaIE
 from .blinkx import BlinkxIE
 from .bloomberg import BloombergIE
 from .bokecc import BokeCCIE
+from .bongacams import BongaCamsIE
 from .bostonglobe import BostonGlobeIE
 from .box import BoxIE
 from .bpb import BpbIE
     CBCOlympicsIE,
 )
 from .cbs import CBSIE
-from .cbslocal import CBSLocalIE
+from .cbslocal import (
+    CBSLocalIE,
+    CBSLocalArticleIE,
+)
 from .cbsinteractive import CBSInteractiveIE
 from .cbsnews import (
     CBSNewsEmbedIE,
 )
 from .cspan import CSpanIE
 from .ctsnews import CtsNewsIE
+from .ctv import CTVIE
 from .ctvnews import CTVNewsIE
 from .cultureunplugged import CultureUnpluggedIE
 from .curiositystream import (
 )
 from .esri import EsriVideoIE
 from .europa import EuropaIE
-from .everyonesmixtape import EveryonesMixtapeIE
 from .expotv import ExpoTVIE
 from .expressen import ExpressenIE
 from .extremetube import ExtremeTubeIE
     FrontendMastersLessonIE,
     FrontendMastersCourseIE
 )
+from .fujitv import FujiTVFODPlus7IE
 from .funimation import FunimationIE
 from .funk import FunkIE
 from .fusion import FusionIE
-from .fxnetworks import FXNetworksIE
 from .gaia import GaiaIE
 from .gameinformer import GameInformerIE
 from .gamespot import GameSpotIE
 from .jwplatform import JWPlatformIE
 from .kakao import KakaoIE
 from .kaltura import KalturaIE
-from .kanalplay import KanalPlayIE
 from .kankan import KankanIE
 from .karaoketv import KaraoketvIE
 from .karrierevideos import KarriereVideosIE
     EHFTVIE,
     ITTFIE,
 )
-from .lbry import LBRYIE
+from .lbry import (
+    LBRYIE,
+    LBRYChannelIE,
+)
 from .lci import LCIIE
 from .lcp import (
     LcpPlayIE,
     NaverIE,
     NaverLiveIE,
 )
-from .nba import NBAIE
+from .nba import (
+    NBAWatchEmbedIE,
+    NBAWatchIE,
+    NBAWatchCollectionIE,
+    NBAEmbedIE,
+    NBAIE,
+    NBAChannelIE,
+)
 from .nbc import (
-    CSNNEIE,
     NBCIE,
     NBCNewsIE,
     NBCOlympicsIE,
     NexxIE,
     NexxEmbedIE,
 )
-from .nfl import NFLIE
-from .nhk import NhkVodIE
+from .nfl import (
+    NFLIE,
+    NFLArticleIE,
+)
+from .nhk import (
+    NhkVodIE,
+    NhkVodProgramIE,
+)
 from .nhl import NHLIE
 from .nick import (
     NickIE,
 from .nitter import NitterIE
 from .njpwworld import NJPWWorldIE
 from .nobelprize import NobelPrizeIE
-from .noco import NocoIE
 from .nonktube import NonkTubeIE
 from .noovo import NoovoIE
 from .normalboots import NormalbootsIE
     NRKSkoleIE,
     NRKTVIE,
     NRKTVDirekteIE,
+    NRKRadioPodkastIE,
     NRKTVEpisodeIE,
     NRKTVEpisodesIE,
     NRKTVSeasonIE,
 from .sky import (
     SkyNewsIE,
     SkySportsIE,
+    SkySportsNewsIE,
 )
 from .slideshare import SlideshareIE
 from .slideslive import SlidesLiveIE
 from .slutload import SlutloadIE
-from .smotri import (
-    SmotriIE,
-    SmotriCommunityIE,
-    SmotriUserIE,
-    SmotriBroadcastIE,
-)
 from .snotr import SnotrIE
 from .sohu import SohuIE
 from .sonyliv import SonyLIVIE
     TagesschauIE,
 )
 from .tass import TassIE
-from .tastytrade import TastyTradeIE
 from .tbs import TBSIE
 from .tdslifeway import TDSLifewayIE
 from .teachable import (
     TeleQuebecSquatIE,
     TeleQuebecEmissionIE,
     TeleQuebecLiveIE,
+    TeleQuebecVideoIE,
 )
 from .teletask import TeleTaskIE
 from .telewebion import TelewebionIE
     EMPFlixIE,
     MovieFapIE,
 )
-from .toggle import ToggleIE
+from .toggle import (
+    ToggleIE,
+    MeWatchIE,
+)
 from .tonline import TOnlineIE
 from .toongoggles import ToonGogglesIE
 from .toutv import TouTvIE
 from .tv2hu import TV2HuIE
 from .tv4 import TV4IE
 from .tv5mondeplus import TV5MondePlusIE
-from .tva import TVAIE
+from .tv5unis import (
+    TV5UnisVideoIE,
+    TV5UnisIE,
+)
+from .tva import (
+    TVAIE,
+    QubIE,
+)
 from .tvanouvelles import (
     TVANouvellesIE,
     TVANouvellesArticleIE,
     TVCIE,
     TVCArticleIE,
 )
+from .tver import TVerIE
 from .tvigle import TvigleIE
 from .tvland import TVLandIE
 from .tvn24 import TVN24IE
 from .medialaan import MedialaanIE
 from .vube import VubeIE
 from .vuclip import VuClipIE
-from .vvvvid import VVVVIDIE
+from .vvvvid import (
+    VVVVIDIE,
+    VVVVIDShowIE,
+)
 from .vyborymos import VyboryMosIE
 from .vzaar import VzaarIE
 from .wakanim import WakanimIE
     WeiboMobileIE
 )
 from .weiqitv import WeiqiTVIE
-from .wistia import WistiaIE
+from .wistia import (
+    WistiaIE,
+    WistiaPlaylistIE,
+)
 from .worldstarhiphop import WorldStarHipHopIE
 from .wsj import (
     WSJIE,
     YandexMusicTrackIE,
     YandexMusicAlbumIE,
     YandexMusicPlaylistIE,
+    YandexMusicArtistTracksIE,
+    YandexMusicArtistAlbumsIE,
 )
 from .yandexvideo import YandexVideoIE
 from .yapfiles import YapFilesIE
     YoutubeSubscriptionsIE,
     YoutubeTruncatedIDIE,
     YoutubeTruncatedURLIE,
+    YoutubeYtBeIE,
     YoutubeYtUserIE,
     YoutubeWatchLaterIE,
 )
 from .zapiks import ZapiksIE
-from .zaq1 import Zaq1IE
 from .zattoo import (
     BBVTVIE,
     EinsUndEinsTVIE,
index 610d6674592384922f9df7af4da5958592ce56bd..cb34c59f544fe39e08814c947a9c66418302dd63 100644 (file)
@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import json
 import re
 import socket
 
@@ -8,6 +9,7 @@
 from ..compat import (
     compat_etree_fromstring,
     compat_http_client,
+    compat_str,
     compat_urllib_error,
     compat_urllib_parse_unquote,
     compat_urllib_parse_unquote_plus,
     clean_html,
     error_to_compat_str,
     ExtractorError,
+    float_or_none,
     get_element_by_id,
     int_or_none,
     js_to_json,
     limit_length,
     parse_count,
+    qualities,
     sanitized_Request,
     try_get,
     urlencode_postdata,
+    urljoin,
 )
 
 
@@ -39,11 +44,13 @@ class FacebookIE(InfoExtractor):
                                 photo\.php|
                                 video\.php|
                                 video/embed|
-                                story\.php
+                                story\.php|
+                                watch(?:/live)?/?
                             )\?(?:.*?)(?:v|video_id|story_fbid)=|
                             [^/]+/videos/(?:[^/]+/)?|
                             [^/]+/posts/|
-                            groups/[^/]+/permalink/
+                            groups/[^/]+/permalink/|
+                            watchparty/
                         )|
                     facebook:
                 )
@@ -54,8 +61,6 @@ class FacebookIE(InfoExtractor):
     _NETRC_MACHINE = 'facebook'
     IE_NAME = 'facebook'
 
-    _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
-
     _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s'
     _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary'
 
@@ -72,6 +77,7 @@ class FacebookIE(InfoExtractor):
         },
         'skip': 'Requires logging in',
     }, {
+        # data.video
         'url': 'https://www.facebook.com/video.php?v=274175099429670',
         'info_dict': {
             'id': '274175099429670',
@@ -133,6 +139,7 @@ class FacebookIE(InfoExtractor):
         },
     }, {
         # have 1080P, but only up to 720p in swf params
+        # data.video.story.attachments[].media
         'url': 'https://www.facebook.com/cnn/videos/10155529876156509/',
         'md5': '9571fae53d4165bbbadb17a94651dcdc',
         'info_dict': {
@@ -147,6 +154,7 @@ class FacebookIE(InfoExtractor):
         },
     }, {
         # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall
+        # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
         'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/',
         'info_dict': {
             'id': '1417995061575415',
@@ -174,6 +182,7 @@ class FacebookIE(InfoExtractor):
             'skip_download': True,
         },
     }, {
+        # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
         'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/',
         'info_dict': {
             'id': '1396382447100162',
@@ -193,18 +202,23 @@ class FacebookIE(InfoExtractor):
         'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf',
         'only_matching': True,
     }, {
+        # data.mediaset.currMedia.edges
         'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater',
         'only_matching': True,
     }, {
+        # data.video.story.attachments[].media
         'url': 'facebook:544765982287235',
         'only_matching': True,
     }, {
+        # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
         'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/',
         'only_matching': True,
     }, {
+        # data.video.creation_story.attachments[].media
         'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/',
         'only_matching': True,
     }, {
+        # data.video
         'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670',
         'only_matching': True,
     }, {
@@ -212,6 +226,7 @@ class FacebookIE(InfoExtractor):
         'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/',
         'only_matching': True,
     }, {
+        # data.video
         'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/',
         'info_dict': {
             'id': '359649331226507',
@@ -222,7 +237,64 @@ class FacebookIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
+    }, {
+        # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media
+        'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/',
+        'info_dict': {
+            'id': '106560053808006',
+        },
+        'playlist_count': 2,
+    }, {
+        # data.video.story.attachments[].media
+        'url': 'https://www.facebook.com/watch/?v=647537299265662',
+        'only_matching': True,
+    }, {
+        # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media
+        'url': 'https://www.facebook.com/PankajShahLondon/posts/10157667649866271',
+        'info_dict': {
+            'id': '10157667649866271',
+        },
+        'playlist_count': 3,
+    }, {
+        # data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media
+        'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330',
+        'info_dict': {
+            'id': '117576630041613',
+            'ext': 'mp4',
+            # TODO: title can be extracted from video page
+            'title': 'Facebook video #117576630041613',
+            'uploader_id': '189393014416438',
+            'upload_date': '20201123',
+            'timestamp': 1606162592,
+        },
+        'skip': 'Requires logging in',
+    }, {
+        # node.comet_sections.content.story.attached_story.attachments.style_type_renderer.attachment.media
+        'url': 'https://www.facebook.com/groups/ateistiskselskab/permalink/10154930137678856/',
+        'info_dict': {
+            'id': '211567722618337',
+            'ext': 'mp4',
+            'title': 'Facebook video #211567722618337',
+            'uploader_id': '127875227654254',
+            'upload_date': '20161122',
+            'timestamp': 1479793574,
+        },
+    }, {
+        # data.video.creation_story.attachments[].media
+        'url': 'https://www.facebook.com/watch/live/?v=1823658634322275',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.facebook.com/watchparty/211641140192478',
+        'info_dict': {
+            'id': '211641140192478',
+        },
+        'playlist_count': 1,
+        'skip': 'Requires logging in',
     }]
+    _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)'
+    _api_config = {
+        'graphURI': '/api/graphql/'
+    }
 
     @staticmethod
     def _extract_urls(webpage):
@@ -305,23 +377,24 @@ def _login(self):
     def _real_initialize(self):
         self._login()
 
-    def _extract_from_url(self, url, video_id, fatal_if_no_video=True):
-        req = sanitized_Request(url)
-        req.add_header('User-Agent', self._CHROME_USER_AGENT)
-        webpage = self._download_webpage(req, video_id)
+    def _extract_from_url(self, url, video_id):
+        webpage = self._download_webpage(
+            url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
 
         video_data = None
 
         def extract_video_data(instances):
+            video_data = []
             for item in instances:
-                if item[1][0] == 'VideoConfig':
+                if try_get(item, lambda x: x[1][0]) == 'VideoConfig':
                     video_item = item[2][0]
                     if video_item.get('video_id'):
-                        return video_item['videoData']
+                        video_data.append(video_item['videoData'])
+            return video_data
 
         server_js_data = self._parse_json(self._search_regex(
-            r'handleServerJS\(({.+})(?:\);|,")', webpage,
-            'server js data', default='{}'), video_id, fatal=False)
+            [r'handleServerJS\(({.+})(?:\);|,")', r'\bs\.handle\(({.+?})\);'],
+            webpage, 'server js data', default='{}'), video_id, fatal=False)
 
         if server_js_data:
             video_data = extract_video_data(server_js_data.get('instances', []))
@@ -331,17 +404,118 @@ def extract_from_jsmods_instances(js_data):
                 return extract_video_data(try_get(
                     js_data, lambda x: x['jsmods']['instances'], list) or [])
 
+        def extract_dash_manifest(video, formats):
+            dash_manifest = video.get('dash_manifest')
+            if dash_manifest:
+                formats.extend(self._parse_mpd_formats(
+                    compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest))))
+
+        def process_formats(formats):
+            # Downloads with browser's User-Agent are rate limited. Working around
+            # with non-browser User-Agent.
+            for f in formats:
+                f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
+
+            self._sort_formats(formats)
+
+        def extract_relay_data(_filter):
+            return self._parse_json(self._search_regex(
+                r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter,
+                webpage, 'replay data', default='{}'), video_id, fatal=False) or {}
+
+        def extract_relay_prefetched_data(_filter):
+            replay_data = extract_relay_data(_filter)
+            for require in (replay_data.get('require') or []):
+                if require[0] == 'RelayPrefetchedStreamCache':
+                    return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {}
+
         if not video_data:
-            server_js_data = self._parse_json(
-                self._search_regex(
-                    r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)',
-                    webpage, 'js data', default='{}'),
-                video_id, transform_source=js_to_json, fatal=False)
+            server_js_data = self._parse_json(self._search_regex([
+                r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX,
+                r'bigPipe\.onPageletArrive\(({.*?id\s*:\s*"%s".*?})\);' % self._SUPPORTED_PAGLETS_REGEX
+            ], webpage, 'js data', default='{}'), video_id, js_to_json, False)
             video_data = extract_from_jsmods_instances(server_js_data)
 
         if not video_data:
-            if not fatal_if_no_video:
-                return webpage, False
+            data = extract_relay_prefetched_data(
+                r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"')
+            if data:
+                entries = []
+
+                def parse_graphql_video(video):
+                    formats = []
+                    q = qualities(['sd', 'hd'])
+                    for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]:
+                        playable_url = video.get('playable_url' + suffix)
+                        if not playable_url:
+                            continue
+                        formats.append({
+                            'format_id': format_id,
+                            'quality': q(format_id),
+                            'url': playable_url,
+                        })
+                    extract_dash_manifest(video, formats)
+                    process_formats(formats)
+                    v_id = video.get('videoId') or video.get('id') or video_id
+                    info = {
+                        'id': v_id,
+                        'formats': formats,
+                        'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']),
+                        'uploader_id': try_get(video, lambda x: x['owner']['id']),
+                        'timestamp': int_or_none(video.get('publish_time')),
+                        'duration': float_or_none(video.get('playable_duration_in_ms'), 1000),
+                    }
+                    description = try_get(video, lambda x: x['savable_description']['text'])
+                    title = video.get('name')
+                    if title:
+                        info.update({
+                            'title': title,
+                            'description': description,
+                        })
+                    else:
+                        info['title'] = description or 'Facebook video #%s' % v_id
+                    entries.append(info)
+
+                def parse_attachment(attachment, key='media'):
+                    media = attachment.get(key) or {}
+                    if media.get('__typename') == 'Video':
+                        return parse_graphql_video(media)
+
+                nodes = data.get('nodes') or []
+                node = data.get('node') or {}
+                if not nodes and node:
+                    nodes.append(node)
+                for node in nodes:
+                    story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {}
+                    attachments = try_get(story, [
+                        lambda x: x['attached_story']['attachments'],
+                        lambda x: x['attachments']
+                    ], list) or []
+                    for attachment in attachments:
+                        attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict)
+                        ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
+                        for n in ns:
+                            parse_attachment(n)
+                        parse_attachment(attachment)
+
+                edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or []
+                for edge in edges:
+                    parse_attachment(edge, key='node')
+
+                video = data.get('video') or {}
+                if video:
+                    attachments = try_get(video, [
+                        lambda x: x['story']['attachments'],
+                        lambda x: x['creation_story']['attachments']
+                    ], list) or []
+                    for attachment in attachments:
+                        parse_attachment(attachment)
+                    if not entries:
+                        parse_graphql_video(video)
+
+                return self.playlist_result(entries, video_id)
+
+        if not video_data:
             m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
             if m_msg is not None:
                 raise ExtractorError(
@@ -350,6 +524,43 @@ def extract_from_jsmods_instances(js_data):
             elif '>You must log in to continue' in webpage:
                 self.raise_login_required()
 
+        if not video_data and '/watchparty/' in url:
+            post_data = {
+                'doc_id': 3731964053542869,
+                'variables': json.dumps({
+                    'livingRoomID': video_id,
+                }),
+            }
+
+            prefetched_data = extract_relay_prefetched_data(r'"login_data"\s*:\s*{')
+            if prefetched_data:
+                lsd = try_get(prefetched_data, lambda x: x['login_data']['lsd'], dict)
+                if lsd:
+                    post_data[lsd['name']] = lsd['value']
+
+            relay_data = extract_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,')
+            for define in (relay_data.get('define') or []):
+                if define[0] == 'RelayAPIConfigDefaults':
+                    self._api_config = define[2]
+
+            living_room = self._download_json(
+                urljoin(url, self._api_config['graphURI']), video_id,
+                data=urlencode_postdata(post_data))['data']['living_room']
+
+            entries = []
+            for edge in (try_get(living_room, lambda x: x['recap']['watched_content']['edges']) or []):
+                video = try_get(edge, lambda x: x['node']['video']) or {}
+                v_id = video.get('id')
+                if not v_id:
+                    continue
+                v_id = compat_str(v_id)
+                entries.append(self.url_result(
+                    self._VIDEO_PAGE_TEMPLATE % v_id,
+                    self.ie_key(), v_id, video.get('name')))
+
+            return self.playlist_result(entries, video_id)
+
+        if not video_data:
             # Video info not in first request, do a secondary request using
             # tahoe player specific URL
             tahoe_data = self._download_webpage(
@@ -379,8 +590,19 @@ def extract_from_jsmods_instances(js_data):
         if not video_data:
             raise ExtractorError('Cannot parse data')
 
-        subtitles = {}
+        if len(video_data) > 1:
+            entries = []
+            for v in video_data:
+                video_url = v[0].get('video_url')
+                if not video_url:
+                    continue
+                entries.append(self.url_result(urljoin(
+                    url, video_url), self.ie_key(), v[0].get('video_id')))
+            return self.playlist_result(entries, video_id)
+        video_data = video_data[0]
+
         formats = []
+        subtitles = {}
         for f in video_data:
             format_id = f['stream_type']
             if f and isinstance(f, dict):
@@ -399,22 +621,14 @@ def extract_from_jsmods_instances(js_data):
                             'url': src,
                             'preference': preference,
                         })
-            dash_manifest = f[0].get('dash_manifest')
-            if dash_manifest:
-                formats.extend(self._parse_mpd_formats(
-                    compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest))))
+            extract_dash_manifest(f[0], formats)
             subtitles_src = f[0].get('subtitles_src')
             if subtitles_src:
                 subtitles.setdefault('en', []).append({'url': subtitles_src})
         if not formats:
             raise ExtractorError('Cannot find video formats')
 
-        # Downloads with browser's User-Agent are rate limited. Working around
-        # with non-browser User-Agent.
-        for f in formats:
-            f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
-
-        self._sort_formats(formats)
+        process_formats(formats)
 
         video_title = self._html_search_regex(
             r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage,
@@ -454,35 +668,13 @@ def extract_from_jsmods_instances(js_data):
             'subtitles': subtitles,
         }
 
-        return webpage, info_dict
+        return info_dict
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
         real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url
-        webpage, info_dict = self._extract_from_url(real_url, video_id, fatal_if_no_video=False)
-
-        if info_dict:
-            return info_dict
-
-        if '/posts/' in url:
-            video_id_json = self._search_regex(
-                r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])', webpage, 'video ids', group='ids',
-                default='')
-            if video_id_json:
-                entries = [
-                    self.url_result('facebook:%s' % vid, FacebookIE.ie_key())
-                    for vid in self._parse_json(video_id_json, video_id)]
-                return self.playlist_result(entries, video_id)
-
-            # Single Video?
-            video_id = self._search_regex(r'video_id:\s*"([0-9]+)"', webpage, 'single video id')
-            return self.url_result('facebook:%s' % video_id, FacebookIE.ie_key())
-        else:
-            _, info_dict = self._extract_from_url(
-                self._VIDEO_PAGE_TEMPLATE % video_id,
-                video_id, fatal_if_no_video=True)
-            return info_dict
+        return self._extract_from_url(real_url, video_id)
 
 
 class FacebookPluginsVideoIE(InfoExtractor):
diff --git a/youtube_dlc/extractor/fujitv.py b/youtube_dlc/extractor/fujitv.py
new file mode 100644 (file)
index 0000000..39685e0
--- /dev/null
@@ -0,0 +1,35 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class FujiTVFODPlus7IE(InfoExtractor):
+    _VALID_URL = r'https?://i\.fod\.fujitv\.co\.jp/plus7/web/[0-9a-z]{4}/(?P<id>[0-9a-z]+)'
+    _BASE_URL = 'http://i.fod.fujitv.co.jp/'
+    _BITRATE_MAP = {
+        300: (320, 180),
+        800: (640, 360),
+        1200: (1280, 720),
+        2000: (1280, 720),
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        formats = self._extract_m3u8_formats(
+            self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id)
+        for f in formats:
+            wh = self._BITRATE_MAP.get(f.get('tbr'))
+            if wh:
+                f.update({
+                    'width': wh[0],
+                    'height': wh[1],
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_id,
+            'formats': formats,
+            'thumbnail': self._BASE_URL + 'pc/image/wbtn/wbtn_%s.jpg' % video_id,
+        }
index 4236a5ed8a9bd638c31cfe33f040c555268c937f..7a1beae3cf023a7d7f01a91f3b1b30c17094a57d 100644 (file)
@@ -1,16 +1,7 @@
 from __future__ import unicode_literals
 
-import re
-
 from .once import OnceIE
-from ..compat import (
-    compat_urllib_parse_unquote,
-)
-from ..utils import (
-    unescapeHTML,
-    url_basename,
-    dict_get,
-)
+from ..compat import compat_urllib_parse_unquote
 
 
 class GameSpotIE(OnceIE):
@@ -24,17 +15,16 @@ class GameSpotIE(OnceIE):
             'title': 'Arma 3 - Community Guide: SITREP I',
             'description': 'Check out this video where some of the basics of Arma 3 is explained.',
         },
+        'skip': 'manifest URL give HTTP Error 404: Not Found',
     }, {
         'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/',
+        'md5': '173ea87ad762cf5d3bf6163dceb255a6',
         'info_dict': {
             'id': 'gs-2300-6424837',
             'ext': 'mp4',
             'title': 'Now Playing - The Witcher 3: Wild Hunt',
             'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.',
         },
-        'params': {
-            'skip_download': True,  # m3u8 downloads
-        },
     }, {
         'url': 'https://www.gamespot.com/videos/embed/6439218/',
         'only_matching': True,
@@ -49,90 +39,40 @@ class GameSpotIE(OnceIE):
     def _real_extract(self, url):
         page_id = self._match_id(url)
         webpage = self._download_webpage(url, page_id)
-        data_video_json = self._search_regex(
-            r'data-video=["\'](.*?)["\']', webpage, 'data video')
-        data_video = self._parse_json(unescapeHTML(data_video_json), page_id)
+        data_video = self._parse_json(self._html_search_regex(
+            r'data-video=(["\'])({.*?})\1', webpage,
+            'video data', group=2), page_id)
+        title = compat_urllib_parse_unquote(data_video['title'])
         streams = data_video['videoStreams']
-
-        manifest_url = None
         formats = []
-        f4m_url = streams.get('f4m_stream')
-        if f4m_url:
-            manifest_url = f4m_url
-            formats.extend(self._extract_f4m_formats(
-                f4m_url + '?hdcore=3.7.0', page_id, f4m_id='hds', fatal=False))
-        m3u8_url = dict_get(streams, ('m3u8_stream', 'adaptive_stream'))
+
+        m3u8_url = streams.get('adaptive_stream')
         if m3u8_url:
-            manifest_url = m3u8_url
             m3u8_formats = self._extract_m3u8_formats(
                 m3u8_url, page_id, 'mp4', 'm3u8_native',
                 m3u8_id='hls', fatal=False)
-            formats.extend(m3u8_formats)
-        progressive_url = dict_get(
-            streams, ('progressive_hd', 'progressive_high', 'progressive_low', 'other_lr'))
-        if progressive_url and manifest_url:
-            qualities_basename = self._search_regex(
-                r'/([^/]+)\.csmil/',
-                manifest_url, 'qualities basename', default=None)
-            if qualities_basename:
-                QUALITIES_RE = r'((,\d+)+,?)'
-                qualities = self._search_regex(
-                    QUALITIES_RE, qualities_basename,
-                    'qualities', default=None)
-                if qualities:
-                    qualities = list(map(lambda q: int(q), qualities.strip(',').split(',')))
-                    qualities.sort()
-                    http_template = re.sub(QUALITIES_RE, r'%d', qualities_basename)
-                    http_url_basename = url_basename(progressive_url)
-                    if m3u8_formats:
-                        self._sort_formats(m3u8_formats)
-                        m3u8_formats = list(filter(
-                            lambda f: f.get('vcodec') != 'none', m3u8_formats))
-                    if len(qualities) == len(m3u8_formats):
-                        for q, m3u8_format in zip(qualities, m3u8_formats):
-                            f = m3u8_format.copy()
-                            f.update({
-                                'url': progressive_url.replace(
-                                    http_url_basename, http_template % q),
-                                'format_id': f['format_id'].replace('hls', 'http'),
-                                'protocol': 'http',
-                            })
-                            formats.append(f)
-                    else:
-                        for q in qualities:
-                            formats.append({
-                                'url': progressive_url.replace(
-                                    http_url_basename, http_template % q),
-                                'ext': 'mp4',
-                                'format_id': 'http-%d' % q,
-                                'tbr': q,
-                            })
+            for f in m3u8_formats:
+                formats.append(f)
+                http_f = f.copy()
+                del http_f['manifest_url']
+                http_f.update({
+                    'format_id': f['format_id'].replace('hls-', 'http-'),
+                    'protocol': 'http',
+                    'url': f['url'].replace('.m3u8', '.mp4'),
+                })
+                formats.append(http_f)
 
-        onceux_json = self._search_regex(
-            r'data-onceux-options=["\'](.*?)["\']', webpage, 'data video', default=None)
-        if onceux_json:
-            onceux_url = self._parse_json(unescapeHTML(onceux_json), page_id).get('metadataUri')
-            if onceux_url:
-                formats.extend(self._extract_once_formats(re.sub(
-                    r'https?://[^/]+', 'http://once.unicornmedia.com', onceux_url),
-                    http_formats_preference=-1))
+        mpd_url = streams.get('adaptive_dash')
+        if mpd_url:
+            formats.extend(self._extract_mpd_formats(
+                mpd_url, page_id, mpd_id='dash', fatal=False))
 
-        if not formats:
-            for quality in ['sd', 'hd']:
-                # It's actually a link to a flv file
-                flv_url = streams.get('f4m_{0}'.format(quality))
-                if flv_url is not None:
-                    formats.append({
-                        'url': flv_url,
-                        'ext': 'flv',
-                        'format_id': quality,
-                    })
         self._sort_formats(formats)
 
         return {
-            'id': data_video['guid'],
+            'id': data_video.get('guid') or page_id,
             'display_id': page_id,
-            'title': compat_urllib_parse_unquote(data_video['title']),
+            'title': title,
             'formats': formats,
             'description': self._html_search_meta('description', webpage),
             'thumbnail': self._og_search_thumbnail(webpage),
index e5d29f316e83450fa6f7f56434219b6a90d552be..6246b8a8397aa3cf8330ca0f51b0b55ab8d294fd 100644 (file)
     sanitized_Request,
     smuggle_url,
     unescapeHTML,
-    unified_strdate,
+    unified_timestamp,
     unsmuggle_url,
     UnsupportedError,
+    url_or_none,
+    xpath_attr,
     xpath_text,
 )
 from .commonprotocols import RtmpIE
@@ -48,7 +50,6 @@
 from .rutv import RUTVIE
 from .tvc import TVCIE
 from .sportbox import SportBoxIE
-from .smotri import SmotriIE
 from .myvi import MyviIE
 from .condenast import CondeNastIE
 from .udn import UDNEmbedIE
 from .mofosex import MofosexEmbedIE
 from .spankwire import SpankwireIE
 from .youporn import YouPornIE
-from .vimeo import VimeoIE
+from .vimeo import (
+    VimeoIE,
+    VHXEmbedIE,
+)
 from .dailymotion import DailymotionIE
 from .dailymail import DailyMailIE
 from .onionstudios import OnionStudiosIE
 from .gedi import GediEmbedsIE
 from .rcs import RCSEmbedsIE
 from .bitchute import BitChuteIE
+from .arcpublishing import ArcPublishingIE
 
 
 class GenericIE(InfoExtractor):
@@ -201,11 +206,46 @@ class GenericIE(InfoExtractor):
         {
             'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
             'info_dict': {
-                'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
-                'ext': 'm4v',
-                'upload_date': '20150228',
-                'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
-            }
+                'id': 'http://podcastfeeds.nbcnews.com/nbcnews/video/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
+                'title': 'MSNBC Rachel Maddow (video)',
+                'description': 're:.*her unique approach to storytelling.*',
+            },
+            'playlist': [{
+                'info_dict': {
+                    'ext': 'mov',
+                    'id': 'pdv_maddow_netcast_mov-12-03-2020-223726',
+                    'title': 'MSNBC Rachel Maddow (video) - 12-03-2020-223726',
+                    'description': 're:.*her unique approach to storytelling.*',
+                    'upload_date': '20201204',
+                },
+            }],
+        },
+        # RSS feed with item with description and thumbnails
+        {
+            'url': 'https://anchor.fm/s/dd00e14/podcast/rss',
+            'info_dict': {
+                'id': 'https://anchor.fm/s/dd00e14/podcast/rss',
+                'title': 're:.*100% Hydrogen.*',
+                'description': 're:.*In this episode.*',
+            },
+            'playlist': [{
+                'info_dict': {
+                    'ext': 'm4a',
+                    'id': 'c1c879525ce2cb640b344507e682c36d',
+                    'title': 're:Hydrogen!',
+                    'description': 're:.*In this episode we are going.*',
+                    'timestamp': 1567977776,
+                    'upload_date': '20190908',
+                    'duration': 459,
+                    'thumbnail': r're:^https?://.*\.jpg$',
+                    'episode_number': 1,
+                    'season_number': 1,
+                    'age_limit': 0,
+                },
+            }],
+            'params': {
+                'skip_download': True,
+            },
         },
         # RSS feed with enclosures and unsupported link URLs
         {
@@ -1986,22 +2026,6 @@ class GenericIE(InfoExtractor):
             },
             'add_ie': [SpringboardPlatformIE.ie_key()],
         },
-        {
-            'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU',
-            'info_dict': {
-                'id': 'uPDB5I9wfp8',
-                'ext': 'webm',
-                'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3',
-                'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d',
-                'upload_date': '20160219',
-                'uploader': 'Pocoyo - Português (BR)',
-                'uploader_id': 'PocoyoBrazil',
-            },
-            'add_ie': [YoutubeIE.ie_key()],
-            'params': {
-                'skip_download': True,
-            },
-        },
         {
             'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html',
             'info_dict': {
@@ -2106,23 +2130,23 @@ class GenericIE(InfoExtractor):
                 'skip_download': True,
             },
         },
-        {
-            # Zype embed
-            'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites',
-            'info_dict': {
-                'id': '5b400b834b32992a310622b9',
-                'ext': 'mp4',
-                'title': 'Smoky Barbecue Favorites',
-                'thumbnail': r're:^https?://.*\.jpe?g',
-                'description': 'md5:5ff01e76316bd8d46508af26dc86023b',
-                'upload_date': '20170909',
-                'timestamp': 1504915200,
-            },
-            'add_ie': [ZypeIE.ie_key()],
-            'params': {
-                'skip_download': True,
-            },
-        },
+        {
+            # Zype embed
+            'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites',
+            'info_dict': {
+                'id': '5b400b834b32992a310622b9',
+                'ext': 'mp4',
+                'title': 'Smoky Barbecue Favorites',
+                'thumbnail': r're:^https?://.*\.jpe?g',
+                'description': 'md5:5ff01e76316bd8d46508af26dc86023b',
+                'upload_date': '20170909',
+                'timestamp': 1504915200,
+            },
+            'add_ie': [ZypeIE.ie_key()],
+            'params': {
+                'skip_download': True,
+            },
+        },
         {
             # videojs embed
             'url': 'https://video.sibnet.ru/shell.php?videoid=3422904',
@@ -2171,7 +2195,32 @@ class GenericIE(InfoExtractor):
         #     'params': {
         #         'force_generic_extractor': True,
         #     },
-        # }
+        # },
+        {
+            # VHX Embed
+            'url': 'https://demo.vhx.tv/category-c/videos/file-example-mp4-480-1-5mg-copy',
+            'info_dict': {
+                'id': '858208',
+                'ext': 'mp4',
+                'title': 'Untitled',
+                'uploader_id': 'user80538407',
+                'uploader': 'OTT Videos',
+            },
+        },
+        {
+            # ArcPublishing PoWa video player
+            'url': 'https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/',
+            'md5': 'b03b2fac8680e1e5a7cc81a5c27e71b3',
+            'info_dict': {
+                'id': '8c99cb6e-b29c-4bc9-9173-7bf9979225ab',
+                'ext': 'mp4',
+                'title': 'Senate candidates wave to voters on Anchorage streets',
+                'description': 'md5:91f51a6511f090617353dc720318b20e',
+                'timestamp': 1604378735,
+                'upload_date': '20201103',
+                'duration': 1581,
+            },
+        },
     ]
 
     def report_following_redirect(self, new_url):
@@ -2183,6 +2232,10 @@ def _extract_rss(self, url, video_id, doc):
         playlist_desc_el = doc.find('./channel/description')
         playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
 
+        NS_MAP = {
+            'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
+        }
+
         entries = []
         for it in doc.findall('./channel/item'):
             next_url = None
@@ -2198,10 +2251,33 @@ def _extract_rss(self, url, video_id, doc):
             if not next_url:
                 continue
 
+            def itunes(key):
+                return xpath_text(
+                    it, xpath_with_ns('./itunes:%s' % key, NS_MAP),
+                    default=None)
+
+            duration = itunes('duration')
+            explicit = (itunes('explicit') or '').lower()
+            if explicit in ('true', 'yes'):
+                age_limit = 18
+            elif explicit in ('false', 'no'):
+                age_limit = 0
+            else:
+                age_limit = None
+
             entries.append({
                 '_type': 'url_transparent',
                 'url': next_url,
                 'title': it.find('title').text,
+                'description': xpath_text(it, 'description', default=None),
+                'timestamp': unified_timestamp(
+                    xpath_text(it, 'pubDate', default=None)),
+                'duration': int_or_none(duration) or parse_duration(duration),
+                'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')),
+                'episode': itunes('title'),
+                'episode_number': int_or_none(itunes('episode')),
+                'season_number': int_or_none(itunes('season')),
+                'age_limit': age_limit,
             })
 
         return {
@@ -2321,7 +2397,7 @@ def _real_extract(self, url):
         info_dict = {
             'id': video_id,
             'title': self._generic_title(url),
-            'upload_date': unified_strdate(head_response.headers.get('Last-Modified'))
+            'timestamp': unified_timestamp(head_response.headers.get('Last-Modified'))
         }
 
         # Check for direct link to a video
@@ -2427,7 +2503,9 @@ def _real_extract(self, url):
         # Sometimes embedded video player is hidden behind percent encoding
         # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
         # Unescaping the whole page allows to handle those cases in a generic way
-        webpage = compat_urllib_parse_unquote(webpage)
+        # FIXME: unescaping the whole page may break URLs, commenting out for now.
+        # There probably should be a second run of generic extractor on unescaped webpage.
+        # webpage = compat_urllib_parse_unquote(webpage)
 
         # Unescape squarespace embeds to be detected by generic extractor,
         # see https://github.com/ytdl-org/youtube-dl/issues/21294
@@ -2509,6 +2587,10 @@ def _real_extract(self, url):
         if tp_urls:
             return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform')
 
+        arc_urls = ArcPublishingIE._extract_urls(webpage)
+        if arc_urls:
+            return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key())
+
         # Look for embedded rtl.nl player
         matches = re.findall(
             r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"',
@@ -2520,6 +2602,10 @@ def _real_extract(self, url):
         if vimeo_urls:
             return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key())
 
+        vhx_url = VHXEmbedIE._extract_url(webpage)
+        if vhx_url:
+            return self.url_result(vhx_url, VHXEmbedIE.ie_key())
+
         vid_me_embed_url = self._search_regex(
             r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
             webpage, 'vid.me embed', default=None)
@@ -2775,11 +2861,6 @@ def _real_extract(self, url):
         if mobj is not None:
             return self.url_result(mobj.group('url'))
 
-        # Look for embedded smotri.com player
-        smotri_url = SmotriIE._extract_url(webpage)
-        if smotri_url:
-            return self.url_result(smotri_url, 'Smotri')
-
         # Look for embedded Myvi.ru player
         myvi_url = MyviIE._extract_url(webpage)
         if myvi_url:
index 7a75dfa49841ab02d39d047aded2302f11abee6d..85dc561e2bfd0ca84064f30c3fb7cf221792c2b2 100644 (file)
@@ -38,13 +38,17 @@ class GoIE(AdobePassIE):
         'disneynow': {
             'brand': '011',
             'resource_id': 'Disney',
-        }
+        },
+        'fxnow.fxnetworks': {
+            'brand': '025',
+            'requestor_id': 'dtci',
+        },
     }
     _VALID_URL = r'''(?x)
                     https?://
                         (?:
                             (?:(?P<sub_domain>%s)\.)?go|
-                            (?P<sub_domain_2>abc|freeform|disneynow)
+                            (?P<sub_domain_2>abc|freeform|disneynow|fxnow\.fxnetworks)
                         )\.com/
                         (?:
                             (?:[^/]+/)*(?P<id>[Vv][Dd][Kk][Aa]\w+)|
@@ -99,6 +103,19 @@ class GoIE(AdobePassIE):
             # m3u8 download
             'skip_download': True,
         },
+    }, {
+        'url': 'https://fxnow.fxnetworks.com/shows/better-things/video/vdka12782841',
+        'info_dict': {
+            'id': 'VDKA12782841',
+            'ext': 'mp4',
+            'title': 'First Look: Better Things - Season 2',
+            'description': 'md5:fa73584a95761c605d9d54904e35b407',
+        },
+        'params': {
+            'geo_bypass_ip_block': '3.244.239.0/24',
+            # m3u8 download
+            'skip_download': True,
+        },
     }, {
         'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding',
         'only_matching': True,
index c3eba011442353a4e3c377ee68737993b2573a38..1eeddc3b6cdfa090fb747a77096bbf899b055dc9 100644 (file)
@@ -22,7 +22,7 @@
 
 
 class InstagramIE(InfoExtractor):
-    _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv)/(?P<id>[^/?#&]+))'
+    _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))'
     _TESTS = [{
         'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
         'md5': '0d2da106a9d2631273e192b372806516',
@@ -35,7 +35,7 @@ class InstagramIE(InfoExtractor):
             'timestamp': 1371748545,
             'upload_date': '20130620',
             'uploader_id': 'naomipq',
-            'uploader': 'Naomi Leonor Phan-Quang',
+            'uploader': 'B E A U T Y  F O R  A S H E S',
             'like_count': int,
             'comment_count': int,
             'comments': list,
@@ -95,6 +95,9 @@ class InstagramIE(InfoExtractor):
     }, {
         'url': 'https://www.instagram.com/tv/aye83DjauH/',
         'only_matching': True,
+    }, {
+        'url': 'https://www.instagram.com/reel/CDUMkliABpa/',
+        'only_matching': True,
     }]
 
     @staticmethod
@@ -122,81 +125,92 @@ def _real_extract(self, url):
 
         webpage = self._download_webpage(url, video_id)
 
-        (video_url, description, thumbnail, timestamp, uploader,
+        (media, video_url, description, thumbnail, timestamp, uploader,
          uploader_id, like_count, comment_count, comments, height,
-         width) = [None] * 11
-
-        shared_data = try_get(webpage,
-                              (lambda x: self._parse_json(
-                                  self._search_regex(
-                                      r'window\.__additionalDataLoaded\(\'/(?:p|tv)/(?:[^/?#&]+)/\',({.+?})\);',
-                                      x, 'additional data', default='{}'),
-                                  video_id, fatal=False),
-                               lambda x: self._parse_json(
-                                  self._search_regex(
-                                      r'window\._sharedData\s*=\s*({.+?});',
-                                      x, 'shared data', default='{}'),
-                                  video_id, fatal=False)['entry_data']['PostPage'][0]),
-                              None)
+         width) = [None] * 12
+
+        shared_data = self._parse_json(
+            self._search_regex(
+                r'window\._sharedData\s*=\s*({.+?});',
+                webpage, 'shared data', default='{}'),
+            video_id, fatal=False)
         if shared_data:
             media = try_get(
                 shared_data,
-                (lambda x: x['graphql']['shortcode_media'],
-                 lambda x: x['media']),
+                (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'],
+                 lambda x: x['entry_data']['PostPage'][0]['media']),
                 dict)
-            if media:
-                video_url = media.get('video_url')
-                height = int_or_none(media.get('dimensions', {}).get('height'))
-                width = int_or_none(media.get('dimensions', {}).get('width'))
-                description = try_get(
-                    media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
-                    compat_str) or media.get('caption')
-                thumbnail = media.get('display_src') or media.get('thumbnail_src')
-                timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date'))
-                uploader = media.get('owner', {}).get('full_name')
-                uploader_id = media.get('owner', {}).get('username')
-
-                def get_count(key, kind):
-                    return int_or_none(try_get(
+        # _sharedData.entry_data.PostPage is empty when authenticated (see
+        # https://github.com/ytdl-org/youtube-dl/pull/22880)
+        if not media:
+            additional_data = self._parse_json(
+                self._search_regex(
+                    r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;',
+                    webpage, 'additional data', default='{}'),
+                video_id, fatal=False)
+            if additional_data:
+                media = try_get(
+                    additional_data, lambda x: x['graphql']['shortcode_media'],
+                    dict)
+        if media:
+            video_url = media.get('video_url')
+            height = int_or_none(media.get('dimensions', {}).get('height'))
+            width = int_or_none(media.get('dimensions', {}).get('width'))
+            description = try_get(
+                media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
+                compat_str) or media.get('caption')
+            thumbnail = media.get('display_src') or media.get('display_url')
+            timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date'))
+            uploader = media.get('owner', {}).get('full_name')
+            uploader_id = media.get('owner', {}).get('username')
+
+            def get_count(keys, kind):
+                if not isinstance(keys, (list, tuple)):
+                    keys = [keys]
+                for key in keys:
+                    count = int_or_none(try_get(
                         media, (lambda x: x['edge_media_%s' % key]['count'],
                                 lambda x: x['%ss' % kind]['count'])))
-                like_count = get_count('preview_like', 'like')
-                comment_count = get_count('to_comment', 'comment')
-
-                comments = [{
-                    'author': comment.get('user', {}).get('username'),
-                    'author_id': comment.get('user', {}).get('id'),
-                    'id': comment.get('id'),
-                    'text': comment.get('text'),
-                    'timestamp': int_or_none(comment.get('created_at')),
-                } for comment in media.get(
-                    'comments', {}).get('nodes', []) if comment.get('text')]
-                if not video_url:
-                    edges = try_get(
-                        media, lambda x: x['edge_sidecar_to_children']['edges'],
-                        list) or []
-                    if edges:
-                        entries = []
-                        for edge_num, edge in enumerate(edges, start=1):
-                            node = try_get(edge, lambda x: x['node'], dict)
-                            if not node:
-                                continue
-                            node_video_url = url_or_none(node.get('video_url'))
-                            if not node_video_url:
-                                continue
-                            entries.append({
-                                'id': node.get('shortcode') or node['id'],
-                                'title': 'Video %d' % edge_num,
-                                'url': node_video_url,
-                                'thumbnail': node.get('display_url'),
-                                'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])),
-                                'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])),
-                                'view_count': int_or_none(node.get('video_view_count')),
-                            })
-                        return self.playlist_result(
-                            entries, video_id,
-                            'Post by %s' % uploader_id if uploader_id else None,
-                            description)
+                    if count is not None:
+                        return count
+            like_count = get_count('preview_like', 'like')
+            comment_count = get_count(
+                ('preview_comment', 'to_comment', 'to_parent_comment'), 'comment')
+
+            comments = [{
+                'author': comment.get('user', {}).get('username'),
+                'author_id': comment.get('user', {}).get('id'),
+                'id': comment.get('id'),
+                'text': comment.get('text'),
+                'timestamp': int_or_none(comment.get('created_at')),
+            } for comment in media.get(
+                'comments', {}).get('nodes', []) if comment.get('text')]
+            if not video_url:
+                edges = try_get(
+                    media, lambda x: x['edge_sidecar_to_children']['edges'],
+                    list) or []
+                if edges:
+                    entries = []
+                    for edge_num, edge in enumerate(edges, start=1):
+                        node = try_get(edge, lambda x: x['node'], dict)
+                        if not node:
+                            continue
+                        node_video_url = url_or_none(node.get('video_url'))
+                        if not node_video_url:
+                            continue
+                        entries.append({
+                            'id': node.get('shortcode') or node['id'],
+                            'title': 'Video %d' % edge_num,
+                            'url': node_video_url,
+                            'thumbnail': node.get('display_url'),
+                            'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])),
+                            'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])),
+                            'view_count': int_or_none(node.get('video_view_count')),
+                        })
+                    return self.playlist_result(
+                        entries, video_id,
+                        'Post by %s' % uploader_id if uploader_id else None,
+                        description)
 
         if not video_url:
             video_url = self._og_search_video_url(webpage, secure=False)
index 20144cd829d7f889c99390aebcfc2b2cad20373e..b767ca0ddc9cf53d540bc1d5848736586d6f5e19 100644 (file)
@@ -1,30 +1,22 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import uuid
-import xml.etree.ElementTree as etree
 import json
 import re
 
 from .common import InfoExtractor
 from .brightcove import BrightcoveNewIE
-from ..compat import (
-    compat_str,
-    compat_etree_register_namespace,
-)
 from ..utils import (
+    clean_html,
     determine_ext,
-    ExtractorError,
     extract_attributes,
-    int_or_none,
+    get_element_by_class,
+    JSON_LD_RE,
     merge_dicts,
     parse_duration,
     smuggle_url,
     try_get,
     url_or_none,
-    xpath_with_ns,
-    xpath_element,
-    xpath_text,
 )
 
 
@@ -32,14 +24,18 @@ class ITVIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)'
     _GEO_COUNTRIES = ['GB']
     _TESTS = [{
-        'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053',
+        'url': 'https://www.itv.com/hub/liar/2a4547a0012',
         'info_dict': {
-            'id': '2a2936a0053',
-            'ext': 'flv',
-            'title': 'Home Movie',
+            'id': '2a4547a0012',
+            'ext': 'mp4',
+            'title': 'Liar - Series 2 - Episode 6',
+            'description': 'md5:d0f91536569dec79ea184f0a44cca089',
+            'series': 'Liar',
+            'season_number': 2,
+            'episode_number': 6,
         },
         'params': {
-            # rtmp download
+            # m3u8 download
             'skip_download': True,
         },
     }, {
@@ -62,220 +58,97 @@ def _real_extract(self, url):
         params = extract_attributes(self._search_regex(
             r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params'))
 
-        ns_map = {
-            'soapenv': 'http://schemas.xmlsoap.org/soap/envelope/',
-            'tem': 'http://tempuri.org/',
-            'itv': 'http://schemas.datacontract.org/2004/07/Itv.BB.Mercury.Common.Types',
-            'com': 'http://schemas.itv.com/2009/05/Common',
-        }
-        for ns, full_ns in ns_map.items():
-            compat_etree_register_namespace(ns, full_ns)
-
-        def _add_ns(name):
-            return xpath_with_ns(name, ns_map)
-
-        def _add_sub_element(element, name):
-            return etree.SubElement(element, _add_ns(name))
-
-        production_id = (
-            params.get('data-video-autoplay-id')
-            or '%s#001' % (
-                params.get('data-video-episode-id')
-                or video_id.replace('a', '/')))
-
-        req_env = etree.Element(_add_ns('soapenv:Envelope'))
-        _add_sub_element(req_env, 'soapenv:Header')
-        body = _add_sub_element(req_env, 'soapenv:Body')
-        get_playlist = _add_sub_element(body, ('tem:GetPlaylist'))
-        request = _add_sub_element(get_playlist, 'tem:request')
-        _add_sub_element(request, 'itv:ProductionId').text = production_id
-        _add_sub_element(request, 'itv:RequestGuid').text = compat_str(uuid.uuid4()).upper()
-        vodcrid = _add_sub_element(request, 'itv:Vodcrid')
-        _add_sub_element(vodcrid, 'com:Id')
-        _add_sub_element(request, 'itv:Partition')
-        user_info = _add_sub_element(get_playlist, 'tem:userInfo')
-        _add_sub_element(user_info, 'itv:Broadcaster').text = 'Itv'
-        _add_sub_element(user_info, 'itv:DM')
-        _add_sub_element(user_info, 'itv:RevenueScienceValue')
-        _add_sub_element(user_info, 'itv:SessionId')
-        _add_sub_element(user_info, 'itv:SsoToken')
-        _add_sub_element(user_info, 'itv:UserToken')
-        site_info = _add_sub_element(get_playlist, 'tem:siteInfo')
-        _add_sub_element(site_info, 'itv:AdvertisingRestriction').text = 'None'
-        _add_sub_element(site_info, 'itv:AdvertisingSite').text = 'ITV'
-        _add_sub_element(site_info, 'itv:AdvertisingType').text = 'Any'
-        _add_sub_element(site_info, 'itv:Area').text = 'ITVPLAYER.VIDEO'
-        _add_sub_element(site_info, 'itv:Category')
-        _add_sub_element(site_info, 'itv:Platform').text = 'DotCom'
-        _add_sub_element(site_info, 'itv:Site').text = 'ItvCom'
-        device_info = _add_sub_element(get_playlist, 'tem:deviceInfo')
-        _add_sub_element(device_info, 'itv:ScreenSize').text = 'Big'
-        player_info = _add_sub_element(get_playlist, 'tem:playerInfo')
-        _add_sub_element(player_info, 'itv:Version').text = '2'
-
+        ios_playlist_url = params.get('data-video-playlist') or params['data-video-id']
+        hmac = params['data-video-hmac']
         headers = self.geo_verification_headers()
         headers.update({
-            'Content-Type': 'text/xml; charset=utf-8',
-            'SOAPAction': 'http://tempuri.org/PlaylistService/GetPlaylist',
+            'Accept': 'application/vnd.itv.vod.playlist.v2+json',
+            'Content-Type': 'application/json',
+            'hmac': hmac.upper(),
         })
+        ios_playlist = self._download_json(
+            ios_playlist_url, video_id, data=json.dumps({
+                'user': {
+                    'itvUserId': '',
+                    'entitlements': [],
+                    'token': ''
+                },
+                'device': {
+                    'manufacturer': 'Safari',
+                    'model': '5',
+                    'os': {
+                        'name': 'Windows NT',
+                        'version': '6.1',
+                        'type': 'desktop'
+                    }
+                },
+                'client': {
+                    'version': '4.1',
+                    'id': 'browser'
+                },
+                'variantAvailability': {
+                    'featureset': {
+                        'min': ['hls', 'aes', 'outband-webvtt'],
+                        'max': ['hls', 'aes', 'outband-webvtt']
+                    },
+                    'platformTag': 'dotcom'
+                }
+            }).encode(), headers=headers)
+        video_data = ios_playlist['Playlist']['Video']
+        ios_base_url = video_data.get('Base')
 
-        info = self._search_json_ld(webpage, video_id, default={})
         formats = []
-        subtitles = {}
-
-        def extract_subtitle(sub_url):
-            ext = determine_ext(sub_url, 'ttml')
-            subtitles.setdefault('en', []).append({
-                'url': sub_url,
-                'ext': 'ttml' if ext == 'xml' else ext,
-            })
-
-        resp_env = self._download_xml(
-            params['data-playlist-url'], video_id,
-            headers=headers, data=etree.tostring(req_env), fatal=False)
-        if resp_env:
-            playlist = xpath_element(resp_env, './/Playlist')
-            if playlist is None:
-                fault_code = xpath_text(resp_env, './/faultcode')
-                fault_string = xpath_text(resp_env, './/faultstring')
-                if fault_code == 'InvalidGeoRegion':
-                    self.raise_geo_restricted(
-                        msg=fault_string, countries=self._GEO_COUNTRIES)
-                elif fault_code not in (
-                        'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'):
-                    raise ExtractorError(
-                        '%s said: %s' % (self.IE_NAME, fault_string), expected=True)
-                info.update({
-                    'title': self._og_search_title(webpage),
-                    'episode_title': params.get('data-video-episode'),
-                    'series': params.get('data-video-title'),
-                })
+        for media_file in (video_data.get('MediaFiles') or []):
+            href = media_file.get('Href')
+            if not href:
+                continue
+            if ios_base_url:
+                href = ios_base_url + href
+            ext = determine_ext(href)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    href, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
             else:
-                title = xpath_text(playlist, 'EpisodeTitle', default=None)
-                info.update({
-                    'title': title,
-                    'episode_title': title,
-                    'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),
-                    'series': xpath_text(playlist, 'ProgrammeTitle'),
-                    'duration': parse_duration(xpath_text(playlist, 'Duration')),
+                formats.append({
+                    'url': href,
                 })
-                video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True)
-                media_files = xpath_element(video_element, 'MediaFiles', fatal=True)
-                rtmp_url = media_files.attrib['base']
-
-                for media_file in media_files.findall('MediaFile'):
-                    play_path = xpath_text(media_file, 'URL')
-                    if not play_path:
-                        continue
-                    tbr = int_or_none(media_file.get('bitrate'), 1000)
-                    f = {
-                        'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''),
-                        'play_path': play_path,
-                        # Providing this swfVfy allows to avoid truncated downloads
-                        'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf',
-                        'page_url': url,
-                        'tbr': tbr,
-                        'ext': 'flv',
-                    }
-                    app = self._search_regex(
-                        'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None)
-                    if app:
-                        f.update({
-                            'url': rtmp_url.split('?', 1)[0],
-                            'app': app,
-                        })
-                    else:
-                        f['url'] = rtmp_url
-                    formats.append(f)
-
-                for caption_url in video_element.findall('ClosedCaptioningURIs/URL'):
-                    if caption_url.text:
-                        extract_subtitle(caption_url.text)
+        self._sort_formats(formats)
 
-        ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id')
-        hmac = params.get('data-video-hmac')
-        if ios_playlist_url and hmac and re.match(r'https?://', ios_playlist_url):
-            headers = self.geo_verification_headers()
-            headers.update({
-                'Accept': 'application/vnd.itv.vod.playlist.v2+json',
-                'Content-Type': 'application/json',
-                'hmac': hmac.upper(),
+        subtitles = {}
+        subs = video_data.get('Subtitles') or []
+        for sub in subs:
+            if not isinstance(sub, dict):
+                continue
+            href = url_or_none(sub.get('Href'))
+            if not href:
+                continue
+            subtitles.setdefault('en', []).append({
+                'url': href,
+                'ext': determine_ext(href, 'vtt'),
             })
-            ios_playlist = self._download_json(
-                ios_playlist_url, video_id, data=json.dumps({
-                    'user': {
-                        'itvUserId': '',
-                        'entitlements': [],
-                        'token': ''
-                    },
-                    'device': {
-                        'manufacturer': 'Safari',
-                        'model': '5',
-                        'os': {
-                            'name': 'Windows NT',
-                            'version': '6.1',
-                            'type': 'desktop'
-                        }
-                    },
-                    'client': {
-                        'version': '4.1',
-                        'id': 'browser'
-                    },
-                    'variantAvailability': {
-                        'featureset': {
-                            'min': ['hls', 'aes', 'outband-webvtt'],
-                            'max': ['hls', 'aes', 'outband-webvtt']
-                        },
-                        'platformTag': 'dotcom'
-                    }
-                }).encode(), headers=headers, fatal=False)
-            if ios_playlist:
-                video_data = ios_playlist.get('Playlist', {}).get('Video', {})
-                ios_base_url = video_data.get('Base')
-                for media_file in video_data.get('MediaFiles', []):
-                    href = media_file.get('Href')
-                    if not href:
-                        continue
-                    if ios_base_url:
-                        href = ios_base_url + href
-                    ext = determine_ext(href)
-                    if ext == 'm3u8':
-                        formats.extend(self._extract_m3u8_formats(
-                            href, video_id, 'mp4', entry_protocol='m3u8_native',
-                            m3u8_id='hls', fatal=False))
-                    else:
-                        formats.append({
-                            'url': href,
-                        })
-                subs = video_data.get('Subtitles')
-                if isinstance(subs, list):
-                    for sub in subs:
-                        if not isinstance(sub, dict):
-                            continue
-                        href = url_or_none(sub.get('Href'))
-                        if href:
-                            extract_subtitle(href)
-                if not info.get('duration'):
-                    info['duration'] = parse_duration(video_data.get('Duration'))
-
-        self._sort_formats(formats)
 
-        info.update({
+        info = self._search_json_ld(webpage, video_id, default={})
+        if not info:
+            json_ld = self._parse_json(self._search_regex(
+                JSON_LD_RE, webpage, 'JSON-LD', '{}',
+                group='json_ld'), video_id, fatal=False)
+            if json_ld and json_ld.get('@type') == 'BreadcrumbList':
+                for ile in (json_ld.get('itemListElement:') or []):
+                    item = ile.get('item:') or {}
+                    if item.get('@type') == 'TVEpisode':
+                        item['@context'] = 'http://schema.org'
+                        info = self._json_ld(item, video_id, fatal=False) or {}
+                        break
+
+        return merge_dicts({
             'id': video_id,
+            'title': self._html_search_meta(['og:title', 'twitter:title'], webpage),
             'formats': formats,
             'subtitles': subtitles,
-        })
-
-        webpage_info = self._search_json_ld(webpage, video_id, default={})
-        if not webpage_info.get('title'):
-            webpage_info['title'] = self._html_search_regex(
-                r'(?s)<h\d+[^>]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<',
-                webpage, 'title', default=None) or self._og_search_title(
-                webpage, default=None) or self._html_search_meta(
-                'twitter:title', webpage, 'title',
-                default=None) or webpage_info['episode']
-
-        return merge_dicts(info, webpage_info)
+            'duration': parse_duration(video_data.get('Duration')),
+            'description': clean_html(get_element_by_class('episode-info__synopsis', webpage)),
+        }, info)
 
 
 class ITVBTCCIE(InfoExtractor):
index 6177297ab627e15af6e37b48eda906efbe30375e..41cc245ebdccabd0b00c288b8914a19aa72a256d 100644 (file)
@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import functools
 import json
 
 from .common import InfoExtractor
     ExtractorError,
     int_or_none,
     mimetype2ext,
+    OnDemandPagedList,
     try_get,
+    urljoin,
 )
 
 
-class LBRYIE(InfoExtractor):
-    IE_NAME = 'lbry.tv'
-    _VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P<id>@[^:]+:[0-9a-z]+/[^:]+:[0-9a-z])'
+class LBRYBaseIE(InfoExtractor):
+    _BASE_URL_REGEX = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/'
+    _CLAIM_ID_REGEX = r'[0-9a-f]{1,40}'
+    _OPT_CLAIM_ID = '[^:/?#&]+(?::%s)?' % _CLAIM_ID_REGEX
+    _SUPPORTED_STREAM_TYPES = ['video', 'audio']
+
+    def _call_api_proxy(self, method, display_id, params, resource):
+        return self._download_json(
+            'https://api.lbry.tv/api/v1/proxy',
+            display_id, 'Downloading %s JSON metadata' % resource,
+            headers={'Content-Type': 'application/json-rpc'},
+            data=json.dumps({
+                'method': method,
+                'params': params,
+            }).encode())['result']
+
+    def _resolve_url(self, url, display_id, resource):
+        return self._call_api_proxy(
+            'resolve', display_id, {'urls': url}, resource)[url]
+
+    def _permanent_url(self, url, claim_name, claim_id):
+        return urljoin(url, '/%s:%s' % (claim_name, claim_id))
+
+    def _parse_stream(self, stream, url):
+        stream_value = stream.get('value') or {}
+        stream_type = stream_value.get('stream_type')
+        source = stream_value.get('source') or {}
+        media = stream_value.get(stream_type) or {}
+        signing_channel = stream.get('signing_channel') or {}
+        channel_name = signing_channel.get('name')
+        channel_claim_id = signing_channel.get('claim_id')
+        channel_url = None
+        if channel_name and channel_claim_id:
+            channel_url = self._permanent_url(url, channel_name, channel_claim_id)
+
+        info = {
+            'thumbnail': try_get(stream_value, lambda x: x['thumbnail']['url'], compat_str),
+            'description': stream_value.get('description'),
+            'license': stream_value.get('license'),
+            'timestamp': int_or_none(stream.get('timestamp')),
+            'tags': stream_value.get('tags'),
+            'duration': int_or_none(media.get('duration')),
+            'channel': try_get(signing_channel, lambda x: x['value']['title']),
+            'channel_id': channel_claim_id,
+            'channel_url': channel_url,
+            'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')),
+            'filesize': int_or_none(source.get('size')),
+        }
+        if stream_type == 'audio':
+            info['vcodec'] = 'none'
+        else:
+            info.update({
+                'width': int_or_none(media.get('width')),
+                'height': int_or_none(media.get('height')),
+            })
+        return info
+
+
+class LBRYIE(LBRYBaseIE):
+    IE_NAME = 'lbry'
+    _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'(?P<id>\$/[^/]+/[^/]+/{1}|@{0}/{0}|(?!@){0})'.format(LBRYBaseIE._OPT_CLAIM_ID, LBRYBaseIE._CLAIM_ID_REGEX)
     _TESTS = [{
         # Video
         'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1',
@@ -28,6 +89,8 @@ class LBRYIE(InfoExtractor):
             'description': 'md5:f6cb5c704b332d37f5119313c2c98f51',
             'timestamp': 1595694354,
             'upload_date': '20200725',
+            'width': 1280,
+            'height': 720,
         }
     }, {
         # Audio
@@ -40,6 +103,12 @@ class LBRYIE(InfoExtractor):
             'description': 'md5:661ac4f1db09f31728931d7b88807a61',
             'timestamp': 1591312601,
             'upload_date': '20200604',
+            'tags': list,
+            'duration': 2570,
+            'channel': 'The LBRY Foundation',
+            'channel_id': '0ed629d2b9c601300cacf7eabe9da0be79010212',
+            'channel_url': 'https://lbry.tv/@LBRYFoundation:0ed629d2b9c601300cacf7eabe9da0be79010212',
+            'vcodec': 'none',
         }
     }, {
         'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e',
@@ -47,45 +116,99 @@ class LBRYIE(InfoExtractor):
     }, {
         'url': "https://odysee.com/@ScammerRevolts:b0/I-SYSKEY'D-THE-SAME-SCAMMERS-3-TIMES!:b",
         'only_matching': True,
+    }, {
+        'url': 'https://lbry.tv/Episode-1:e7d93d772bd87e2b62d5ab993c1c3ced86ebb396',
+        'only_matching': True,
+    }, {
+        'url': 'https://lbry.tv/$/embed/Episode-1/e7d93d772bd87e2b62d5ab993c1c3ced86ebb396',
+        'only_matching': True,
+    }, {
+        'url': 'https://lbry.tv/Episode-1:e7',
+        'only_matching': True,
+    }, {
+        'url': 'https://lbry.tv/@LBRYFoundation/Episode-1',
+        'only_matching': True,
+    }, {
+        'url': 'https://lbry.tv/$/download/Episode-1/e7d93d772bd87e2b62d5ab993c1c3ced86ebb396',
+        'only_matching': True,
     }]
 
-    def _call_api_proxy(self, method, display_id, params):
-        return self._download_json(
-            'https://api.lbry.tv/api/v1/proxy', display_id,
-            headers={'Content-Type': 'application/json-rpc'},
-            data=json.dumps({
-                'method': method,
-                'params': params,
-            }).encode())['result']
-
     def _real_extract(self, url):
-        display_id = self._match_id(url).replace(':', '#')
+        display_id = self._match_id(url)
+        if display_id.startswith('$/'):
+            display_id = display_id.split('/', 2)[-1].replace('/', ':')
+        else:
+            display_id = display_id.replace(':', '#')
         uri = 'lbry://' + display_id
-        result = self._call_api_proxy(
-            'resolve', display_id, {'urls': [uri]})[uri]
+        result = self._resolve_url(uri, display_id, 'stream')
         result_value = result['value']
-        if result_value.get('stream_type') not in ('video', 'audio'):
+        if result_value.get('stream_type') not in self._SUPPORTED_STREAM_TYPES:
             raise ExtractorError('Unsupported URL', expected=True)
+        claim_id = result['claim_id']
+        title = result_value['title']
         streaming_url = self._call_api_proxy(
-            'get', display_id, {'uri': uri})['streaming_url']
-        source = result_value.get('source') or {}
-        media = result_value.get('video') or result_value.get('audio') or {}
-        signing_channel = result_value.get('signing_channel') or {}
-
-        return {
-            'id': result['claim_id'],
-            'title': result_value['title'],
-            'thumbnail': try_get(result_value, lambda x: x['thumbnail']['url'], compat_str),
-            'description': result_value.get('description'),
-            'license': result_value.get('license'),
-            'timestamp': int_or_none(result.get('timestamp')),
-            'tags': result_value.get('tags'),
-            'width': int_or_none(media.get('width')),
-            'height': int_or_none(media.get('height')),
-            'duration': int_or_none(media.get('duration')),
-            'channel': signing_channel.get('name'),
-            'channel_id': signing_channel.get('claim_id'),
-            'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')),
-            'filesize': int_or_none(source.get('size')),
+            'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url']
+        info = self._parse_stream(result, url)
+        info.update({
+            'id': claim_id,
+            'title': title,
             'url': streaming_url,
-        }
+        })
+        return info
+
+
+class LBRYChannelIE(LBRYBaseIE):
+    IE_NAME = 'lbry:channel'
+    _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'(?P<id>@%s)/?(?:[?#&]|$)' % LBRYBaseIE._OPT_CLAIM_ID
+    _TESTS = [{
+        'url': 'https://lbry.tv/@LBRYFoundation:0',
+        'info_dict': {
+            'id': '0ed629d2b9c601300cacf7eabe9da0be79010212',
+            'title': 'The LBRY Foundation',
+            'description': 'Channel for the LBRY Foundation. Follow for updates and news.',
+        },
+        'playlist_count': 29,
+    }, {
+        'url': 'https://lbry.tv/@LBRYFoundation',
+        'only_matching': True,
+    }]
+    _PAGE_SIZE = 50
+
+    def _fetch_page(self, claim_id, url, page):
+        page += 1
+        result = self._call_api_proxy(
+            'claim_search', claim_id, {
+                'channel_ids': [claim_id],
+                'claim_type': 'stream',
+                'no_totals': True,
+                'page': page,
+                'page_size': self._PAGE_SIZE,
+                'stream_types': self._SUPPORTED_STREAM_TYPES,
+            }, 'page %d' % page)
+        for item in (result.get('items') or []):
+            stream_claim_name = item.get('name')
+            stream_claim_id = item.get('claim_id')
+            if not (stream_claim_name and stream_claim_id):
+                continue
+
+            info = self._parse_stream(item, url)
+            info.update({
+                '_type': 'url',
+                'id': stream_claim_id,
+                'title': try_get(item, lambda x: x['value']['title']),
+                'url': self._permanent_url(url, stream_claim_name, stream_claim_id),
+            })
+            yield info
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url).replace(':', '#')
+        result = self._resolve_url(
+            'lbry://' + display_id, display_id, 'channel')
+        claim_id = result['claim_id']
+        entries = OnDemandPagedList(
+            functools.partial(self._fetch_page, claim_id, url),
+            self._PAGE_SIZE)
+        result_value = result.get('value') or {}
+        return self.playlist_result(
+            entries, claim_id, result_value.get('title'),
+            result_value.get('description'))
index 23ca965d977b1ec682101f048684f20f1b70834c..7ec4a65573afe2d40a72c20c7f799b3f5ed0baef 100644 (file)
@@ -8,11 +8,15 @@
 from ..compat import (
     compat_b64decode,
     compat_HTTPError,
+    compat_str,
 )
 from ..utils import (
+    clean_html,
     ExtractorError,
-    orderedSet,
-    unescapeHTML,
+    js_to_json,
+    parse_duration,
+    try_get,
+    unified_timestamp,
     urlencode_postdata,
     urljoin,
 )
@@ -28,11 +32,15 @@ class LinuxAcademyIE(InfoExtractor):
                         )
                     '''
     _TESTS = [{
-        'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2/module/154',
+        'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675',
         'info_dict': {
-            'id': '1498-2',
+            'id': '7971-2',
             'ext': 'mp4',
-            'title': "Introduction to the Practitioner's Brief",
+            'title': 'What Is Data Science',
+            'description': 'md5:c574a3c20607144fb36cb65bdde76c99',
+            'timestamp': 1607387907,
+            'upload_date': '20201208',
+            'duration': 304,
         },
         'params': {
             'skip_download': True,
@@ -46,7 +54,8 @@ class LinuxAcademyIE(InfoExtractor):
         'info_dict': {
             'id': '154',
             'title': 'AWS Certified Cloud Practitioner',
-            'description': 'md5:039db7e60e4aac9cf43630e0a75fa834',
+            'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c',
+            'duration': 28835,
         },
         'playlist_count': 41,
         'skip': 'Requires Linux Academy account credentials',
@@ -74,6 +83,7 @@ def random_string():
             self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
                 'client_id': self._CLIENT_ID,
                 'response_type': 'token id_token',
+                'response_mode': 'web_message',
                 'redirect_uri': self._ORIGIN_URL,
                 'scope': 'openid email user_impersonation profile',
                 'audience': self._ORIGIN_URL,
@@ -129,7 +139,13 @@ def random_string():
 
         access_token = self._search_regex(
             r'access_token=([^=&]+)', urlh.geturl(),
-            'access token')
+            'access token', default=None)
+        if not access_token:
+            access_token = self._parse_json(
+                self._search_regex(
+                    r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page,
+                    'authorization response'), None,
+                transform_source=js_to_json)['response']['access_token']
 
         self._download_webpage(
             'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
@@ -144,30 +160,84 @@ def _real_extract(self, url):
 
         # course path
         if course_id:
-            entries = [
-                self.url_result(
-                    urljoin(url, lesson_url), ie=LinuxAcademyIE.ie_key())
-                for lesson_url in orderedSet(re.findall(
-                    r'<a[^>]+\bhref=["\'](/cp/courses/lesson/course/\d+/lesson/\d+/module/\d+)',
-                    webpage))]
-            title = unescapeHTML(self._html_search_regex(
-                (r'class=["\']course-title["\'][^>]*>(?P<value>[^<]+)',
-                 r'var\s+title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'),
-                webpage, 'title', default=None, group='value'))
-            description = unescapeHTML(self._html_search_regex(
-                r'var\s+description\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
-                webpage, 'description', default=None, group='value'))
-            return self.playlist_result(entries, course_id, title, description)
+            module = self._parse_json(
+                self._search_regex(
+                    r'window\.module\s*=\s*({.+?})\s*;', webpage, 'module'),
+                item_id)
+            entries = []
+            chapter_number = None
+            chapter = None
+            chapter_id = None
+            for item in module['items']:
+                if not isinstance(item, dict):
+                    continue
+
+                def type_field(key):
+                    return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower()
+                type_fields = (type_field('name'), type_field('slug'))
+                # Move to next module section
+                if 'section' in type_fields:
+                    chapter = item.get('course_name')
+                    chapter_id = item.get('course_module')
+                    chapter_number = 1 if not chapter_number else chapter_number + 1
+                    continue
+                # Skip non-lessons
+                if 'lesson' not in type_fields:
+                    continue
+                lesson_url = urljoin(url, item.get('url'))
+                if not lesson_url:
+                    continue
+                title = item.get('title') or item.get('lesson_name')
+                description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text'))
+                entries.append({
+                    '_type': 'url_transparent',
+                    'url': lesson_url,
+                    'ie_key': LinuxAcademyIE.ie_key(),
+                    'title': title,
+                    'description': description,
+                    'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')),
+                    'duration': parse_duration(item.get('duration')),
+                    'chapter': chapter,
+                    'chapter_id': chapter_id,
+                    'chapter_number': chapter_number,
+                })
+            return {
+                '_type': 'playlist',
+                'entries': entries,
+                'id': course_id,
+                'title': module.get('title'),
+                'description': module.get('md_desc') or clean_html(module.get('desc')),
+                'duration': parse_duration(module.get('duration')),
+            }
 
         # single video path
-        info = self._extract_jwplayer_data(
-            webpage, item_id, require_title=False, m3u8_id='hls',)
-        title = self._search_regex(
-            (r'>Lecture\s*:\s*(?P<value>[^<]+)',
-             r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
-            'title', group='value')
-        info.update({
+        m3u8_url = self._parse_json(
+            self._search_regex(
+                r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'),
+            item_id)[0]['file']
+        formats = self._extract_m3u8_formats(
+            m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native',
+            m3u8_id='hls')
+        self._sort_formats(formats)
+        info = {
             'id': item_id,
-            'title': title,
-        })
+            'formats': formats,
+        }
+        lesson = self._parse_json(
+            self._search_regex(
+                (r'window\.lesson\s*=\s*({.+?})\s*;',
+                 r'player\.lesson\s*=\s*({.+?})\s*;'),
+                webpage, 'lesson', default='{}'), item_id, fatal=False)
+        if lesson:
+            info.update({
+                'title': lesson.get('lesson_name'),
+                'description': lesson.get('md_desc') or clean_html(lesson.get('desc')),
+                'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')),
+                'duration': parse_duration(lesson.get('duration')),
+            })
+        if not info.get('title'):
+            info['title'] = self._search_regex(
+                (r'>Lecture\s*:\s*(?P<value>[^<]+)',
+                 r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
+                'title', group='value')
         return info
index 322e5b45a141bacd327288ac58ed658a46e628f7..dc6aa981959014b6b7b9167b3974ba9d092b9b71 100644 (file)
@@ -2,12 +2,16 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import compat_urlparse
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
 from ..utils import (
     determine_ext,
     int_or_none,
     parse_duration,
     parse_iso8601,
+    url_or_none,
     xpath_text,
 )
 
@@ -16,6 +20,8 @@ class MDRIE(InfoExtractor):
     IE_DESC = 'MDR.DE and KiKA'
     _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z-]+-?(?P<id>\d+)(?:_.+?)?\.html'
 
+    _GEO_COUNTRIES = ['DE']
+
     _TESTS = [{
         # MDR regularly deletes its videos
         'url': 'http://www.mdr.de/fakt/video189002.html',
@@ -66,6 +72,22 @@ class MDRIE(InfoExtractor):
             'duration': 3239,
             'uploader': 'MITTELDEUTSCHER RUNDFUNK',
         },
+    }, {
+        # empty bitrateVideo and bitrateAudio
+        'url': 'https://www.kika.de/filme/sendung128372_zc-572e3f45_zs-1d9fb70e.html',
+        'info_dict': {
+            'id': '128372',
+            'ext': 'mp4',
+            'title': 'Der kleine Wichtel kehrt zurück',
+            'description': 'md5:f77fafdff90f7aa1e9dca14f662c052a',
+            'duration': 4876,
+            'timestamp': 1607823300,
+            'upload_date': '20201213',
+            'uploader': 'ZDF',
+        },
+        'params': {
+            'skip_download': True,
+        },
     }, {
         'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html',
         'only_matching': True,
@@ -91,10 +113,13 @@ def _real_extract(self, url):
 
         title = xpath_text(doc, ['./title', './broadcast/broadcastName'], 'title', fatal=True)
 
+        type_ = xpath_text(doc, './type', default=None)
+
         formats = []
         processed_urls = []
         for asset in doc.findall('./assets/asset'):
             for source in (
+                    'download',
                     'progressiveDownload',
                     'dynamicHttpStreamingRedirector',
                     'adaptiveHttpStreamingRedirector'):
@@ -102,63 +127,49 @@ def _real_extract(self, url):
                 if url_el is None:
                     continue
 
-                video_url = url_el.text
-                if video_url in processed_urls:
+                video_url = url_or_none(url_el.text)
+                if not video_url or video_url in processed_urls:
                     continue
 
                 processed_urls.append(video_url)
 
-                vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000)
-                abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000)
-
-                ext = determine_ext(url_el.text)
+                ext = determine_ext(video_url)
                 if ext == 'm3u8':
-                    url_formats = self._extract_m3u8_formats(
+                    formats.extend(self._extract_m3u8_formats(
                         video_url, video_id, 'mp4', entry_protocol='m3u8_native',
-                        preference=0, m3u8_id='HLS', fatal=False)
+                        preference=0, m3u8_id='HLS', fatal=False))
                 elif ext == 'f4m':
-                    url_formats = self._extract_f4m_formats(
+                    formats.extend(self._extract_f4m_formats(
                         video_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id,
-                        preference=0, f4m_id='HDS', fatal=False)
+                        preference=0, f4m_id='HDS', fatal=False))
                 else:
                     media_type = xpath_text(asset, './mediaType', 'media type', default='MP4')
                     vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000)
                     abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000)
                     filesize = int_or_none(xpath_text(asset, './fileSize', 'file size'))
 
+                    format_id = [media_type]
+                    if vbr or abr:
+                        format_id.append(compat_str(vbr or abr))
+
                     f = {
                         'url': video_url,
-                        'format_id': '%s-%d' % (media_type, vbr or abr),
+                        'format_id': '-'.join(format_id),
                         'filesize': filesize,
                         'abr': abr,
-                        'preference': 1,
+                        'vbr': vbr,
                     }
 
                     if vbr:
-                        width = int_or_none(xpath_text(asset, './frameWidth', 'width'))
-                        height = int_or_none(xpath_text(asset, './frameHeight', 'height'))
                         f.update({
-                            'vbr': vbr,
-                            'width': width,
-                            'height': height,
+                            'width': int_or_none(xpath_text(asset, './frameWidth', 'width')),
+                            'height': int_or_none(xpath_text(asset, './frameHeight', 'height')),
                         })
 
-                    url_formats = [f]
-
-                if not url_formats:
-                    continue
-
-                if not vbr:
-                    for f in url_formats:
-                        abr = f.get('tbr') or abr
-                        if 'tbr' in f:
-                            del f['tbr']
-                        f.update({
-                            'abr': abr,
-                            'vcodec': 'none',
-                        })
+                    if type_ == 'audio':
+                        f['vcodec'] = 'none'
 
-                formats.extend(url_formats)
+                    formats.append(f)
 
         self._sort_formats(formats)
 
index 933df14952d5cc16857485e306be07f2d32384d3..2c16fc9e21887eada088a335a000530a845c3b3b 100644 (file)
@@ -23,7 +23,7 @@ class MediasetIE(ThePlatformBaseIE):
                         https?://
                             (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/
                             (?:
-                                (?:video|on-demand)/(?:[^/]+/)+[^/]+_|
+                                (?:video|on-demand|movie)/(?:[^/]+/)+[^/]+_|
                                 player/index\.html\?.*?\bprogramGuid=
                             )
                     )(?P<id>[0-9A-Z]{16,})
@@ -88,6 +88,9 @@ class MediasetIE(ThePlatformBaseIE):
     }, {
         'url': 'https://www.mediasetplay.mediaset.it/video/grandefratellovip/benedetta-una-doccia-gelata_F309344401044C135',
         'only_matching': True,
+    }, {
+        'url': 'https://www.mediasetplay.mediaset.it/movie/herculeslaleggendahainizio/hercules-la-leggenda-ha-inizio_F305927501000102',
+        'only_matching': True,
     }]
 
     @staticmethod
index 7f5718e211f37837b6e0327e114d127a3685f0f0..0b240d27f5d95e342412131e0f0a0cbd701e8586 100644 (file)
@@ -2,15 +2,14 @@
 from __future__ import unicode_literals
 import json
 
-from .common import InfoExtractor
+from .telecinco import TelecincoIE
 from ..utils import (
     int_or_none,
     parse_iso8601,
-    smuggle_url,
 )
 
 
-class MiTeleIE(InfoExtractor):
+class MiTeleIE(TelecincoIE):
     IE_DESC = 'mitele.es'
     _VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P<id>[^/]+)/player'
 
@@ -53,7 +52,7 @@ class MiTeleIE(InfoExtractor):
         },
         'params': {
             'skip_download': True,
-        }
+        },
     }, {
         'url': 'http://www.mitele.es/series-online/la-que-se-avecina/57aac5c1c915da951a8b45ed/player',
         'only_matching': True,
@@ -69,13 +68,11 @@ def _real_extract(self, url):
             r'window\.\$REACTBASE_STATE\.prePlayer_mtweb\s*=\s*({.+})',
             webpage, 'Pre Player'), display_id)['prePlayer']
         title = pre_player['title']
-        video = pre_player['video']
-        video_id = video['dataMediaId']
+        video_info = self._parse_content(pre_player['video'], url)
         content = pre_player.get('content') or {}
         info = content.get('info') or {}
 
-        info = {
-            'id': video_id,
+        video_info.update({
             'title': title,
             'description': info.get('synopsis'),
             'series': content.get('title'),
@@ -83,38 +80,7 @@ def _real_extract(self, url):
             'episode': content.get('subtitle'),
             'episode_number': int_or_none(info.get('episode_number')),
             'duration': int_or_none(info.get('duration')),
-            'thumbnail': video.get('dataPoster'),
             'age_limit': int_or_none(info.get('rating')),
             'timestamp': parse_iso8601(pre_player.get('publishedTime')),
-        }
-
-        if video.get('dataCmsId') == 'ooyala':
-            info.update({
-                '_type': 'url_transparent',
-                # for some reason only HLS is supported
-                'url': smuggle_url('ooyala:' + video_id, {'supportedformats': 'm3u8,dash'}),
-            })
-        else:
-            config = self._download_json(
-                video['dataConfig'], video_id, 'Downloading config JSON')
-            services = config['services']
-            gbx = self._download_json(
-                services['gbx'], video_id, 'Downloading gbx JSON')
-            caronte = self._download_json(
-                services['caronte'], video_id, 'Downloading caronte JSON')
-            cerbero = self._download_json(
-                caronte['cerbero'], video_id, 'Downloading cerbero JSON',
-                headers={
-                    'Content-Type': 'application/json;charset=UTF-8',
-                    'Origin': 'https://www.mitele.es'
-                },
-                data=json.dumps({
-                    'bbx': caronte['bbx'],
-                    'gbx': gbx['gbx']
-                }).encode('utf-8'))
-            formats = self._extract_m3u8_formats(
-                caronte['dls'][0]['stream'], video_id, 'mp4', 'm3u8_native', m3u8_id='hls',
-                query=dict([cerbero['tokens']['1']['cdn'].split('=', 1)]))
-            info['formats'] = formats
-
-        return info
+        })
+        return video_info
index be295a7a3b010c375416e0af1a7d27337ca2d4cf..fbc7adaf46550a0b7acc9b61bd744ea074dcd996 100644 (file)
 
 from .turner import TurnerBaseIE
 from ..compat import (
-    compat_urllib_parse_urlencode,
-    compat_urlparse,
+    compat_parse_qs,
+    compat_str,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_urlparse,
 )
 from ..utils import (
+    int_or_none,
+    merge_dicts,
     OnDemandPagedList,
-    remove_start,
+    parse_duration,
+    parse_iso8601,
+    try_get,
+    update_url_query,
+    urljoin,
 )
 
 
-class NBAIE(TurnerBaseIE):
-    _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P<path>(?:[^/]+/)+(?P<id>[^?]*?))/?(?:/index\.html)?(?:\?.*)?$'
+class NBACVPBaseIE(TurnerBaseIE):
+    def _extract_nba_cvp_info(self, path, video_id, fatal=False):
+        return self._extract_cvp_info(
+            'http://secure.nba.com/%s' % path, video_id, {
+                'default': {
+                    'media_src': 'http://nba.cdn.turner.com/nba/big',
+                },
+                'm3u8': {
+                    'media_src': 'http://nbavod-f.akamaihd.net',
+                },
+            }, fatal=fatal)
+
+
+class NBAWatchBaseIE(NBACVPBaseIE):
+    _VALID_URL_BASE = r'https?://(?:(?:www\.)?nba\.com(?:/watch)?|watch\.nba\.com)/'
+
+    def _extract_video(self, filter_key, filter_value):
+        video = self._download_json(
+            'https://neulionscnbav2-a.akamaihd.net/solr/nbad_program/usersearch',
+            filter_value, query={
+                'fl': 'description,image,name,pid,releaseDate,runtime,tags,seoName',
+                'q': filter_key + ':' + filter_value,
+                'wt': 'json',
+            })['response']['docs'][0]
+
+        video_id = str(video['pid'])
+        title = video['name']
+
+        formats = []
+        m3u8_url = (self._download_json(
+            'https://watch.nba.com/service/publishpoint', video_id, query={
+                'type': 'video',
+                'format': 'json',
+                'id': video_id,
+            }, headers={
+                'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A402 Safari/604.1',
+            }, fatal=False) or {}).get('path')
+        if m3u8_url:
+            m3u8_formats = self._extract_m3u8_formats(
+                re.sub(r'_(?:pc|iphone)\.', '.', m3u8_url), video_id, 'mp4',
+                'm3u8_native', m3u8_id='hls', fatal=False)
+            formats.extend(m3u8_formats)
+            for f in m3u8_formats:
+                http_f = f.copy()
+                http_f.update({
+                    'format_id': http_f['format_id'].replace('hls-', 'http-'),
+                    'protocol': 'http',
+                    'url': http_f['url'].replace('.m3u8', ''),
+                })
+                formats.append(http_f)
+
+        info = {
+            'id': video_id,
+            'title': title,
+            'thumbnail': urljoin('https://nbadsdmt.akamaized.net/media/nba/nba/thumbs/', video.get('image')),
+            'description': video.get('description'),
+            'duration': int_or_none(video.get('runtime')),
+            'timestamp': parse_iso8601(video.get('releaseDate')),
+            'tags': video.get('tags'),
+        }
+
+        seo_name = video.get('seoName')
+        if seo_name and re.search(r'\d{4}/\d{2}/\d{2}/', seo_name):
+            base_path = ''
+            if seo_name.startswith('teams/'):
+                base_path += seo_name.split('/')[1] + '/'
+            base_path += 'video/'
+            cvp_info = self._extract_nba_cvp_info(
+                base_path + seo_name + '.xml', video_id, False)
+            if cvp_info:
+                formats.extend(cvp_info['formats'])
+                info = merge_dicts(info, cvp_info)
+
+        self._sort_formats(formats)
+        info['formats'] = formats
+        return info
+
+
+class NBAWatchEmbedIE(NBAWatchBaseIE):
+    IE_NAME = 'nba:watch:embed'
+    _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'embed\?.*?\bid=(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://watch.nba.com/embed?id=659395',
+        'md5': 'b7e3f9946595f4ca0a13903ce5edd120',
+        'info_dict': {
+            'id': '659395',
+            'ext': 'mp4',
+            'title': 'Mix clip: More than 7 points of  Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. the Clippers, 4/15/2017',
+            'description': 'Mix clip: More than 7 points of  Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. the Clippers, 4/15/2017',
+            'timestamp': 1492228800,
+            'upload_date': '20170415',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self._extract_video('pid', video_id)
+
+
+class NBAWatchIE(NBAWatchBaseIE):
+    IE_NAME = 'nba:watch'
+    _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'(?:nba/)?video/(?P<id>.+?(?=/index\.html)|(?:[^/]+/)*[^/?#&]+)'
     _TESTS = [{
         'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html',
-        'md5': '9e7729d3010a9c71506fd1248f74e4f4',
+        'md5': '9d902940d2a127af3f7f9d2f3dc79c96',
         'info_dict': {
-            'id': '0021200253-okc-bkn-recap',
+            'id': '70946',
             'ext': 'mp4',
             'title': 'Thunder vs. Nets',
             'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.',
             'duration': 181,
-            'timestamp': 1354638466,
+            'timestamp': 1354597200,
             'upload_date': '20121204',
         },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
     }, {
         'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/',
         'only_matching': True,
@@ -39,116 +143,286 @@ class NBAIE(TurnerBaseIE):
         'url': 'http://watch.nba.com/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba',
         'md5': 'b2b39b81cf28615ae0c3360a3f9668c4',
         'info_dict': {
-            'id': 'channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba',
+            'id': '330865',
             'ext': 'mp4',
             'title': 'Hawks vs. Cavaliers Game 1',
             'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d',
             'duration': 228,
-            'timestamp': 1432134543,
-            'upload_date': '20150520',
+            'timestamp': 1432094400,
+            'upload_date': '20150521',
         },
-        'expected_warnings': ['Unable to download f4m manifest'],
     }, {
-        'url': 'http://www.nba.com/clippers/news/doc-rivers-were-not-trading-blake',
-        'info_dict': {
-            'id': 'teams/clippers/2016/02/17/1455672027478-Doc_Feb16_720.mov-297324',
-            'ext': 'mp4',
-            'title': 'Practice: Doc Rivers - 2/16/16',
-            'description': 'Head Coach Doc Rivers addresses the media following practice.',
-            'upload_date': '20160216',
-            'timestamp': 1455672000,
-        },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
-        'expected_warnings': ['Unable to download f4m manifest'],
+        'url': 'http://watch.nba.com/nba/video/channels/nba_tv/2015/06/11/YT_go_big_go_home_Game4_061115',
+        'only_matching': True,
     }, {
-        'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#',
-        'info_dict': {
-            'id': 'timberwolves',
-            'title': 'Shootaround Access - Dec. 12 | Andrew Wiggins',
-        },
-        'playlist_count': 30,
-        'params': {
-            # Download the whole playlist takes too long time
-            'playlist_items': '1-30',
-        },
+        # only CVP mp4 format available
+        'url': 'https://watch.nba.com/video/teams/cavaliers/2012/10/15/sloan121015mov-2249106',
+        'only_matching': True,
     }, {
-        'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#',
+        'url': 'https://watch.nba.com/video/top-100-dunks-from-the-2019-20-season?plsrc=nba&collection=2019-20-season-highlights',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        collection_id = compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('collection', [None])[0]
+        if collection_id:
+            if self._downloader.params.get('noplaylist'):
+                self.to_screen('Downloading just video %s because of --no-playlist' % display_id)
+            else:
+                self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % collection_id)
+                return self.url_result(
+                    'https://www.nba.com/watch/list/collection/' + collection_id,
+                    NBAWatchCollectionIE.ie_key(), collection_id)
+        return self._extract_video('seoName', display_id)
+
+
+class NBAWatchCollectionIE(NBAWatchBaseIE):
+    IE_NAME = 'nba:watch:collection'
+    _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'list/collection/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://watch.nba.com/list/collection/season-preview-2020',
         'info_dict': {
-            'id': 'teams/timberwolves/2014/12/12/Wigginsmp4-3462601',
-            'ext': 'mp4',
-            'title': 'Shootaround Access - Dec. 12 | Andrew Wiggins',
-            'description': 'Wolves rookie Andrew Wiggins addresses the media after Friday\'s shootaround.',
-            'upload_date': '20141212',
-            'timestamp': 1418418600,
+            'id': 'season-preview-2020',
         },
-        'params': {
-            'noplaylist': True,
-            # m3u8 download
-            'skip_download': True,
-        },
-        'expected_warnings': ['Unable to download f4m manifest'],
+        'playlist_mincount': 43,
     }]
+    _PAGE_SIZE = 100
 
-    _PAGE_SIZE = 30
+    def _fetch_page(self, collection_id, page):
+        page += 1
+        videos = self._download_json(
+            'https://content-api-prod.nba.com/public/1/endeavor/video-list/collection/' + collection_id,
+            collection_id, 'Downloading page %d JSON metadata' % page, query={
+                'count': self._PAGE_SIZE,
+                'page': page,
+            })['results']['videos']
+        for video in videos:
+            program = video.get('program') or {}
+            seo_name = program.get('seoName') or program.get('slug')
+            if not seo_name:
+                continue
+            yield {
+                '_type': 'url',
+                'id': program.get('id'),
+                'title': program.get('title') or video.get('title'),
+                'url': 'https://www.nba.com/watch/video/' + seo_name,
+                'thumbnail': video.get('image'),
+                'description': program.get('description') or video.get('description'),
+                'duration': parse_duration(program.get('runtimeHours')),
+                'timestamp': parse_iso8601(video.get('releaseDate')),
+            }
 
-    def _fetch_page(self, team, video_id, page):
-        search_url = 'http://searchapp2.nba.com/nba-search/query.jsp?' + compat_urllib_parse_urlencode({
-            'type': 'teamvideo',
-            'start': page * self._PAGE_SIZE + 1,
-            'npp': (page + 1) * self._PAGE_SIZE + 1,
-            'sort': 'recent',
-            'output': 'json',
-            'site': team,
-        })
-        results = self._download_json(
-            search_url, video_id, note='Download page %d of playlist data' % page)['results'][0]
-        for item in results:
-            yield self.url_result(compat_urlparse.urljoin('http://www.nba.com/', item['url']))
-
-    def _extract_playlist(self, orig_path, video_id, webpage):
-        team = orig_path.split('/')[0]
-
-        if self._downloader.params.get('noplaylist'):
-            self.to_screen('Downloading just video because of --no-playlist')
-            video_path = self._search_regex(
-                r'nbaVideoCore\.firstVideo\s*=\s*\'([^\']+)\';', webpage, 'video path')
-            video_url = 'http://www.nba.com/%s/video/%s' % (team, video_path)
-            return self.url_result(video_url)
-
-        self.to_screen('Downloading playlist - add --no-playlist to just download video')
-        playlist_title = self._og_search_title(webpage, fatal=False)
+    def _real_extract(self, url):
+        collection_id = self._match_id(url)
         entries = OnDemandPagedList(
-            functools.partial(self._fetch_page, team, video_id),
+            functools.partial(self._fetch_page, collection_id),
             self._PAGE_SIZE)
+        return self.playlist_result(entries, collection_id)
 
-        return self.playlist_result(entries, team, playlist_title)
 
-    def _real_extract(self, url):
-        path, video_id = re.match(self._VALID_URL, url).groups()
-        orig_path = path
-        if path.startswith('nba/'):
-            path = path[3:]
+class NBABaseIE(NBACVPBaseIE):
+    _VALID_URL_BASE = r'''(?x)
+        https?://(?:www\.)?nba\.com/
+            (?P<team>
+                blazers|
+                bucks|
+                bulls|
+                cavaliers|
+                celtics|
+                clippers|
+                grizzlies|
+                hawks|
+                heat|
+                hornets|
+                jazz|
+                kings|
+                knicks|
+                lakers|
+                magic|
+                mavericks|
+                nets|
+                nuggets|
+                pacers|
+                pelicans|
+                pistons|
+                raptors|
+                rockets|
+                sixers|
+                spurs|
+                suns|
+                thunder|
+                timberwolves|
+                warriors|
+                wizards
+            )
+        (?:/play\#)?/'''
+    _CHANNEL_PATH_REGEX = r'video/channel|series'
 
-        if 'video/' not in path:
-            webpage = self._download_webpage(url, video_id)
-            path = remove_start(self._search_regex(r'data-videoid="([^"]+)"', webpage, 'video id'), '/')
+    def _embed_url_result(self, team, content_id):
+        return self.url_result(update_url_query(
+            'https://secure.nba.com/assets/amp/include/video/iframe.html', {
+                'contentId': content_id,
+                'team': team,
+            }), NBAEmbedIE.ie_key())
 
-            if path == '{{id}}':
-                return self._extract_playlist(orig_path, video_id, webpage)
+    def _call_api(self, team, content_id, query, resource):
+        return self._download_json(
+            'https://api.nba.net/2/%s/video,imported_video,wsc/' % team,
+            content_id, 'Download %s JSON metadata' % resource,
+            query=query, headers={
+                'accessToken': 'internal|bb88df6b4c2244e78822812cecf1ee1b',
+            })['response']['result']
 
-            # See prepareContentId() of pkgCvp.js
-            if path.startswith('video/teams'):
-                path = 'video/channels/proxy/' + path[6:]
+    def _extract_video(self, video, team, extract_all=True):
+        video_id = compat_str(video['nid'])
+        team = video['brand']
 
-        return self._extract_cvp_info(
-            'http://www.nba.com/%s.xml' % path, video_id, {
-                'default': {
-                    'media_src': 'http://nba.cdn.turner.com/nba/big',
-                },
-                'm3u8': {
-                    'media_src': 'http://nbavod-f.akamaihd.net',
-                },
+        info = {
+            'id': video_id,
+            'title': video.get('title') or video.get('headline') or video['shortHeadline'],
+            'description': video.get('description'),
+            'timestamp': parse_iso8601(video.get('published')),
+        }
+
+        subtitles = {}
+        captions = try_get(video, lambda x: x['videoCaptions']['sidecars'], dict) or {}
+        for caption_url in captions.values():
+            subtitles.setdefault('en', []).append({'url': caption_url})
+
+        formats = []
+        mp4_url = video.get('mp4')
+        if mp4_url:
+            formats.append({
+                'url': mp4_url,
             })
+
+        if extract_all:
+            source_url = video.get('videoSource')
+            if source_url and not source_url.startswith('s3://') and self._is_valid_url(source_url, video_id, 'source'):
+                formats.append({
+                    'format_id': 'source',
+                    'url': source_url,
+                    'preference': 1,
+                })
+
+            m3u8_url = video.get('m3u8')
+            if m3u8_url:
+                if '.akamaihd.net/i/' in m3u8_url:
+                    formats.extend(self._extract_akamai_formats(
+                        m3u8_url, video_id, {'http': 'pmd.cdn.turner.com'}))
+                else:
+                    formats.extend(self._extract_m3u8_formats(
+                        m3u8_url, video_id, 'mp4',
+                        'm3u8_native', m3u8_id='hls', fatal=False))
+
+            content_xml = video.get('contentXml')
+            if team and content_xml:
+                cvp_info = self._extract_nba_cvp_info(
+                    team + content_xml, video_id, fatal=False)
+                if cvp_info:
+                    formats.extend(cvp_info['formats'])
+                    subtitles = self._merge_subtitles(subtitles, cvp_info['subtitles'])
+                    info = merge_dicts(info, cvp_info)
+
+            self._sort_formats(formats)
+        else:
+            info.update(self._embed_url_result(team, video['videoId']))
+
+        info.update({
+            'formats': formats,
+            'subtitles': subtitles,
+        })
+
+        return info
+
+    def _real_extract(self, url):
+        team, display_id = re.match(self._VALID_URL, url).groups()
+        if '/play#/' in url:
+            display_id = compat_urllib_parse_unquote(display_id)
+        else:
+            webpage = self._download_webpage(url, display_id)
+            display_id = self._search_regex(
+                self._CONTENT_ID_REGEX + r'\s*:\s*"([^"]+)"', webpage, 'video id')
+        return self._extract_url_results(team, display_id)
+
+
+class NBAEmbedIE(NBABaseIE):
+    IE_NAME = 'nba:embed'
+    _VALID_URL = r'https?://secure\.nba\.com/assets/amp/include/video/(?:topI|i)frame\.html\?.*?\bcontentId=(?P<id>[^?#&]+)'
+    _TESTS = [{
+        'url': 'https://secure.nba.com/assets/amp/include/video/topIframe.html?contentId=teams/bulls/2020/12/04/3478774/1607105587854-20201204_SCHEDULE_RELEASE_FINAL_DRUPAL-3478774&team=bulls&adFree=false&profile=71&videoPlayerName=TAMPCVP&baseUrl=&videoAdsection=nba.com_mobile_web_teamsites_chicagobulls&ampEnv=',
+        'only_matching': True,
+    }, {
+        'url': 'https://secure.nba.com/assets/amp/include/video/iframe.html?contentId=2016/10/29/0021600027boschaplay7&adFree=false&profile=71&team=&videoPlayerName=LAMPCVP',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        content_id = qs['contentId'][0]
+        team = qs.get('team', [None])[0]
+        if not team:
+            return self.url_result(
+                'https://watch.nba.com/video/' + content_id, NBAWatchIE.ie_key())
+        video = self._call_api(team, content_id, {'videoid': content_id}, 'video')[0]
+        return self._extract_video(video, team)
+
+
+class NBAIE(NBABaseIE):
+    IE_NAME = 'nba'
+    _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?!%s)video/(?P<id>(?:[^/]+/)*[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX
+    _TESTS = [{
+        'url': 'https://www.nba.com/bulls/video/teams/bulls/2020/12/04/3478774/1607105587854-20201204schedulereleasefinaldrupal-3478774',
+        'info_dict': {
+            'id': '45039',
+            'ext': 'mp4',
+            'title': 'AND WE BACK.',
+            'description': 'Part 1 of our 2020-21 schedule is here! Watch our games on NBC Sports Chicago.',
+            'duration': 94,
+            'timestamp': 1607112000,
+            'upload_date': '20201218',
+        },
+    }, {
+        'url': 'https://www.nba.com/bucks/play#/video/teams%2Fbucks%2F2020%2F12%2F17%2F64860%2F1608252863446-Op_Dream_16x9-64860',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.nba.com/bucks/play#/video/wsc%2Fteams%2F2787C911AA1ACD154B5377F7577CCC7134B2A4B0',
+        'only_matching': True,
+    }]
+    _CONTENT_ID_REGEX = r'videoID'
+
+    def _extract_url_results(self, team, content_id):
+        return self._embed_url_result(team, content_id)
+
+
+class NBAChannelIE(NBABaseIE):
+    IE_NAME = 'nba:channel'
+    _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?:%s)/(?P<id>[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX
+    _TESTS = [{
+        'url': 'https://www.nba.com/blazers/video/channel/summer_league',
+        'info_dict': {
+            'title': 'Summer League',
+        },
+        'playlist_mincount': 138,
+    }, {
+        'url': 'https://www.nba.com/bucks/play#/series/On%20This%20Date',
+        'only_matching': True,
+    }]
+    _CONTENT_ID_REGEX = r'videoSubCategory'
+    _PAGE_SIZE = 100
+
+    def _fetch_page(self, team, channel, page):
+        results = self._call_api(team, channel, {
+            'channels': channel,
+            'count': self._PAGE_SIZE,
+            'offset': page * self._PAGE_SIZE,
+        }, 'page %d' % (page + 1))
+        for video in results:
+            yield self._extract_video(video, team, False)
+
+    def _extract_url_results(self, team, content_id):
+        entries = OnDemandPagedList(
+            functools.partial(self._fetch_page, team, content_id),
+            self._PAGE_SIZE)
+        return self.playlist_result(entries, playlist_title=content_id)
index ea5f5a3157c3ac068a7236e9c6251c00b1017231..0d77648c2d3748d8d2814cf1ad8c7dae2d4e2393 100644 (file)
@@ -158,7 +158,8 @@ def _real_extract(self, url):
 
 
 class NBCSportsVPlayerIE(InfoExtractor):
-    _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)'
+    _VALID_URL_BASE = r'https?://(?:vplayer\.nbcsports\.com|(?:www\.)?nbcsports\.com/vplayer)/'
+    _VALID_URL = _VALID_URL_BASE + r'(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)'
 
     _TESTS = [{
         'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI',
@@ -174,12 +175,15 @@ class NBCSportsVPlayerIE(InfoExtractor):
     }, {
         'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/_hqLjQ95yx8Z',
         'only_matching': True,
+    }, {
+        'url': 'https://www.nbcsports.com/vplayer/p/BxmELC/nbcsports/select/PHJSaFWbrTY9?form=html&autoPlay=true',
+        'only_matching': True,
     }]
 
     @staticmethod
     def _extract_url(webpage):
         iframe_m = re.search(
-            r'<iframe[^>]+src="(?P<url>https?://vplayer\.nbcsports\.com/[^"]+)"', webpage)
+            r'<(?:iframe[^>]+|div[^>]+data-(?:mpx-)?)src="(?P<url>%s[^"]+)"' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage)
         if iframe_m:
             return iframe_m.group('url')
 
@@ -192,21 +196,29 @@ def _real_extract(self, url):
 
 
 class NBCSportsIE(InfoExtractor):
-    # Does not include https because its certificate is invalid
-    _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)'
+    _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?!vplayer/)(?:[^/]+/)+(?P<id>[0-9a-z-]+)'
 
-    _TEST = {
+    _TESTS = [{
+        # iframe src
         'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke',
         'info_dict': {
             'id': 'PHJSaFWbrTY9',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke',
             'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113',
             'uploader': 'NBCU-SPORTS',
             'upload_date': '20150330',
             'timestamp': 1427726529,
         }
-    }
+    }, {
+        # data-mpx-src
+        'url': 'https://www.nbcsports.com/philadelphia/philadelphia-phillies/bruce-bochy-hector-neris-hes-idiot',
+        'only_matching': True,
+    }, {
+        # data-src
+        'url': 'https://www.nbcsports.com/boston/video/report-card-pats-secondary-no-match-josh-allen',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -274,33 +286,6 @@ def _real_extract(self, url):
         }
 
 
-class CSNNEIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?csnne\.com/video/(?P<id>[0-9a-z-]+)'
-
-    _TEST = {
-        'url': 'http://www.csnne.com/video/snc-evening-update-wright-named-red-sox-no-5-starter',
-        'info_dict': {
-            'id': 'yvBLLUgQ8WU0',
-            'ext': 'mp4',
-            'title': 'SNC evening update: Wright named Red Sox\' No. 5 starter.',
-            'description': 'md5:1753cfee40d9352b19b4c9b3e589b9e3',
-            'timestamp': 1459369979,
-            'upload_date': '20160330',
-            'uploader': 'NBCU-SPORTS',
-        }
-    }
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-        return {
-            '_type': 'url_transparent',
-            'ie_key': 'ThePlatform',
-            'url': self._html_search_meta('twitter:player:stream', webpage),
-            'display_id': display_id,
-        }
-
-
 class NBCNewsIE(ThePlatformIE):
     _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P<id>[^/?]+)'
 
index 460deb162df7994caa389b1f37c4174cec3fbf78..871923e4c66afd4076db3022e59f502bba4881c4 100644 (file)
@@ -4,19 +4,15 @@
 import re
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse_urlparse,
-)
 from ..utils import (
-    ExtractorError,
-    int_or_none,
-    remove_end,
+    clean_html,
+    determine_ext,
+    get_element_by_class,
 )
 
 
-class NFLIE(InfoExtractor):
-    IE_NAME = 'nfl.com'
-    _VALID_URL = r'''(?x)
+class NFLBaseIE(InfoExtractor):
+    _VALID_URL_BASE = r'''(?x)
                     https?://
                         (?P<host>
                             (?:www\.)?
@@ -34,15 +30,15 @@ class NFLIE(InfoExtractor):
                                     houstontexans|
                                     colts|
                                     jaguars|
-                                    titansonline|
+                                    (?:titansonline|tennesseetitans)|
                                     denverbroncos|
-                                    kcchiefs|
+                                    (?:kc)?chiefs|
                                     raiders|
                                     chargers|
                                     dallascowboys|
                                     giants|
                                     philadelphiaeagles|
-                                    redskins|
+                                    (?:redskins|washingtonfootball)|
                                     chicagobears|
                                     detroitlions|
                                     packers|
@@ -52,180 +48,113 @@ class NFLIE(InfoExtractor):
                                     neworleanssaints|
                                     buccaneers|
                                     azcardinals|
-                                    stlouisrams|
+                                    (?:stlouis|the)rams|
                                     49ers|
                                     seahawks
                                 )\.com|
                                 .+?\.clubs\.nfl\.com
                             )
                         )/
-                        (?:.+?/)*
-                        (?P<id>[^/#?&]+)
                     '''
+    _VIDEO_CONFIG_REGEX = r'<script[^>]+id="[^"]*video-config-[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}[^"]*"[^>]*>\s*({.+})'
+    _WORKING = False
+
+    def _parse_video_config(self, video_config, display_id):
+        video_config = self._parse_json(video_config, display_id)
+        item = video_config['playlist'][0]
+        mcp_id = item.get('mcpID')
+        if mcp_id:
+            info = self.url_result(
+                'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:' + mcp_id,
+                'Anvato', mcp_id)
+        else:
+            media_id = item.get('id') or item['entityId']
+            title = item['title']
+            item_url = item['url']
+            info = {'id': media_id}
+            ext = determine_ext(item_url)
+            if ext == 'm3u8':
+                info['formats'] = self._extract_m3u8_formats(item_url, media_id, 'mp4')
+                self._sort_formats(info['formats'])
+            else:
+                info['url'] = item_url
+                if item.get('audio') is True:
+                    info['vcodec'] = 'none'
+            is_live = video_config.get('live') is True
+            thumbnails = None
+            image_url = item.get(item.get('imageSrc')) or item.get(item.get('posterImage'))
+            if image_url:
+                thumbnails = [{
+                    'url': image_url,
+                    'ext': determine_ext(image_url, 'jpg'),
+                }]
+            info.update({
+                'title': self._live_title(title) if is_live else title,
+                'is_live': is_live,
+                'description': clean_html(item.get('description')),
+                'thumbnails': thumbnails,
+            })
+        return info
+
+
+class NFLIE(NFLBaseIE):
+    IE_NAME = 'nfl.com'
+    _VALID_URL = NFLBaseIE._VALID_URL_BASE + r'(?:videos?|listen|audio)/(?P<id>[^/#?&]+)'
     _TESTS = [{
-        'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
-        'md5': '394ef771ddcd1354f665b471d78ec4c6',
+        'url': 'https://www.nfl.com/videos/baker-mayfield-s-game-changing-plays-from-3-td-game-week-14',
         'info_dict': {
-            'id': '0ap3000000398478',
+            'id': '899441',
             'ext': 'mp4',
-            'title': 'Week 3: Redskins vs. Eagles highlights',
-            'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
-            'upload_date': '20140921',
-            'timestamp': 1411337580,
+            'title': "Baker Mayfield's game-changing plays from 3-TD game Week 14",
+            'description': 'md5:85e05a3cc163f8c344340f220521136d',
+            'upload_date': '20201215',
+            'timestamp': 1608009755,
             'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'NFL',
         }
     }, {
-        'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266',
-        'md5': 'cf85bdb4bc49f6e9d3816d130c78279c',
+        'url': 'https://www.chiefs.com/listen/patrick-mahomes-travis-kelce-react-to-win-over-dolphins-the-breakdown',
+        'md5': '6886b32c24b463038c760ceb55a34566',
         'info_dict': {
-            'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266',
-            'ext': 'mp4',
-            'title': 'LIVE: Post Game vs. Browns',
-            'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8',
-            'upload_date': '20131229',
-            'timestamp': 1388354455,
-            'thumbnail': r're:^https?://.*\.jpg$',
+            'id': 'd87e8790-3e14-11eb-8ceb-ff05c2867f99',
+            'ext': 'mp3',
+            'title': 'Patrick Mahomes, Travis Kelce React to Win Over Dolphins | The Breakdown',
+            'description': 'md5:12ada8ee70e6762658c30e223e095075',
         }
     }, {
-        'url': 'http://www.nfl.com/news/story/0ap3000000467586/article/patriots-seahawks-involved-in-lategame-skirmish',
-        'info_dict': {
-            'id': '0ap3000000467607',
-            'ext': 'mp4',
-            'title': 'Frustrations flare on the field',
-            'description': 'Emotions ran high at the end of the Super Bowl on both sides of the ball after a dramatic finish.',
-            'timestamp': 1422850320,
-            'upload_date': '20150202',
-        },
-    }, {
-        'url': 'http://www.patriots.com/video/2015/09/18/10-days-gillette',
-        'md5': '4c319e2f625ffd0b481b4382c6fc124c',
-        'info_dict': {
-            'id': 'n-238346',
-            'ext': 'mp4',
-            'title': '10 Days at Gillette',
-            'description': 'md5:8cd9cd48fac16de596eadc0b24add951',
-            'timestamp': 1442618809,
-            'upload_date': '20150918',
-        },
-    }, {
-        # lowercase data-contentid
-        'url': 'http://www.steelers.com/news/article-1/Tomlin-on-Ben-getting-Vick-ready/56399c96-4160-48cf-a7ad-1d17d4a3aef7',
-        'info_dict': {
-            'id': '12693586-6ea9-4743-9c1c-02c59e4a5ef2',
-            'ext': 'mp4',
-            'title': 'Tomlin looks ahead to Ravens on a short week',
-            'description': 'md5:32f3f7b139f43913181d5cbb24ecad75',
-            'timestamp': 1443459651,
-            'upload_date': '20150928',
-        },
-        'params': {
-            'skip_download': True,
-        },
-    }, {
-        'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood',
+        'url': 'https://www.buffalobills.com/video/buffalo-bills-military-recognition-week-14',
         'only_matching': True,
     }, {
-        'url': 'http://www.buffalobills.com/video/videos/Rex_Ryan_Show_World_Wide_Rex/b1dcfab2-3190-4bb1-bfc0-d6e603d6601a',
+        'url': 'https://www.raiders.com/audio/instant-reactions-raiders-week-14-loss-to-indianapolis-colts-espn-jason-fitz',
         'only_matching': True,
     }]
 
-    @staticmethod
-    def prepend_host(host, url):
-        if not url.startswith('http'):
-            if not url.startswith('/'):
-                url = '/%s' % url
-            url = 'http://{0:}{1:}'.format(host, url)
-        return url
-
-    @staticmethod
-    def format_from_stream(stream, protocol, host, path_prefix='',
-                           preference=0, note=None):
-        url = '{protocol:}://{host:}/{prefix:}{path:}'.format(
-            protocol=protocol,
-            host=host,
-            prefix=path_prefix,
-            path=stream.get('path'),
-        )
-        return {
-            'url': url,
-            'vbr': int_or_none(stream.get('rate', 0), 1000),
-            'preference': preference,
-            'format_note': note,
-        }
-
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id, host = mobj.group('id'), mobj.group('host')
-
-        webpage = self._download_webpage(url, video_id)
-
-        config_url = NFLIE.prepend_host(host, self._search_regex(
-            r'(?:(?:config|configURL)\s*:\s*|<nflcs:avplayer[^>]+data-config\s*=\s*)(["\'])(?P<config>.+?)\1',
-            webpage, 'config URL', default='static/content/static/config/video/config.json',
-            group='config'))
-        # For articles, the id in the url is not the video id
-        video_id = self._search_regex(
-            r'(?:<nflcs:avplayer[^>]+data-content[Ii]d\s*=\s*|content[Ii]d\s*:\s*)(["\'])(?P<id>(?:(?!\1).)+)\1',
-            webpage, 'video id', default=video_id, group='id')
-        config = self._download_json(config_url, video_id, 'Downloading player config')
-        url_template = NFLIE.prepend_host(
-            host, '{contentURLTemplate:}'.format(**config))
-        video_data = self._download_json(
-            url_template.format(id=video_id), video_id)
-
-        formats = []
-        cdn_data = video_data.get('cdnData', {})
-        streams = cdn_data.get('bitrateInfo', [])
-        if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM':
-            parts = compat_urllib_parse_urlparse(cdn_data.get('uri'))
-            protocol, host = parts.scheme, parts.netloc
-            for stream in streams:
-                formats.append(
-                    NFLIE.format_from_stream(stream, protocol, host))
-        else:
-            cdns = config.get('cdns')
-            if not cdns:
-                raise ExtractorError('Failed to get CDN data', expected=True)
-
-            for name, cdn in cdns.items():
-                # LimeLight streams don't seem to work
-                if cdn.get('name') == 'LIMELIGHT':
-                    continue
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        return self._parse_video_config(self._search_regex(
+            self._VIDEO_CONFIG_REGEX, webpage, 'video config'), display_id)
 
-                protocol = cdn.get('protocol')
-                host = remove_end(cdn.get('host', ''), '/')
-                if not (protocol and host):
-                    continue
 
-                prefix = cdn.get('pathprefix', '')
-                if prefix and not prefix.endswith('/'):
-                    prefix = '%s/' % prefix
-
-                preference = 0
-                if protocol == 'rtmp':
-                    preference = -2
-                elif 'prog' in name.lower():
-                    preference = 1
-
-                for stream in streams:
-                    formats.append(
-                        NFLIE.format_from_stream(stream, protocol, host,
-                                                 prefix, preference, name))
-
-        self._sort_formats(formats)
-
-        thumbnail = None
-        for q in ('xl', 'l', 'm', 's', 'xs'):
-            thumbnail = video_data.get('imagePaths', {}).get(q)
-            if thumbnail:
-                break
+class NFLArticleIE(NFLBaseIE):
+    IE_NAME = 'nfl.com:article'
+    _VALID_URL = NFLBaseIE._VALID_URL_BASE + r'news/(?P<id>[^/#?&]+)'
+    _TEST = {
+        'url': 'https://www.buffalobills.com/news/the-only-thing-we-ve-earned-is-the-noise-bills-coaches-discuss-handling-rising-e',
+        'info_dict': {
+            'id': 'the-only-thing-we-ve-earned-is-the-noise-bills-coaches-discuss-handling-rising-e',
+            'title': "'The only thing we've earned is the noise' | Bills coaches discuss handling rising expectations",
+        },
+        'playlist_count': 4,
+    }
 
-        return {
-            'id': video_id,
-            'title': video_data.get('headline'),
-            'formats': formats,
-            'description': video_data.get('caption'),
-            'duration': video_data.get('duration'),
-            'thumbnail': thumbnail,
-            'timestamp': int_or_none(video_data.get('posted'), 1000),
-        }
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        entries = []
+        for video_config in re.findall(self._VIDEO_CONFIG_REGEX, webpage):
+            entries.append(self._parse_video_config(video_config, display_id))
+        title = clean_html(get_element_by_class(
+            'nfl-c-article__title', webpage)) or self._html_search_meta(
+            ['og:title', 'twitter:title'], webpage)
+        return self.playlist_result(entries, display_id, title)
index de6a707c4265c4fc61a57db117a432a95468ab54..8a9331a79f24b227db941760cd5274a8bcaa1827 100644 (file)
@@ -3,51 +3,33 @@
 import re
 
 from .common import InfoExtractor
+from ..utils import urljoin
 
 
-class NhkVodIE(InfoExtractor):
-    _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand/(?P<type>video|audio)/(?P<id>\d{7}|[^/]+?-\d{8}-\d+)'
-    # Content available only for a limited period of time. Visit
-    # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
-    _TESTS = [{
-        # clip
-        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/',
-        'md5': '256a1be14f48d960a7e61e2532d95ec3',
-        'info_dict': {
-            'id': 'a95j5iza',
-            'ext': 'mp4',
-            'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU",
-            'description': 'md5:5aee4a9f9d81c26281862382103b0ea5',
-            'timestamp': 1565965194,
-            'upload_date': '20190816',
-        },
-    }, {
-        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/',
-        'only_matching': True,
-    }, {
-        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/plugin-20190404-1/',
-        'only_matching': True,
-    }, {
-        'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/',
-        'only_matching': True,
-    }, {
-        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/',
-        'only_matching': True,
-    }]
-    _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/episode/%s/%s/all%s.json'
+class NhkBaseIE(InfoExtractor):
+    _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json'
+    _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand'
+    _TYPE_REGEX = r'/(?P<type>video|audio)/'
 
-    def _real_extract(self, url):
-        lang, m_type, episode_id = re.match(self._VALID_URL, url).groups()
+    def _call_api(self, m_id, lang, is_video, is_episode, is_clip):
+        return self._download_json(
+            self._API_URL_TEMPLATE % (
+                'v' if is_video else 'r',
+                'clip' if is_clip else 'esd',
+                'episode' if is_episode else 'program',
+                m_id, lang, '/all' if is_video else ''),
+            m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] or []
+
+    def _extract_episode_info(self, url, episode=None):
+        fetch_episode = episode is None
+        lang, m_type, episode_id = re.match(NhkVodIE._VALID_URL, url).groups()
         if episode_id.isdigit():
             episode_id = episode_id[:4] + '-' + episode_id[4:]
 
         is_video = m_type == 'video'
-        episode = self._download_json(
-            self._API_URL_TEMPLATE % (
-                'v' if is_video else 'r',
-                'clip' if episode_id[:4] == '9999' else 'esd',
-                episode_id, lang, '/all' if is_video else ''),
-            episode_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'][0]
+        if fetch_episode:
+            episode = self._call_api(
+                episode_id, lang, is_video, True, episode_id[:4] == '9999')[0]
         title = episode.get('sub_title_clean') or episode['sub_title']
 
         def get_clean_field(key):
@@ -76,18 +58,121 @@ def get_clean_field(key):
             'episode': title,
         }
         if is_video:
+            vod_id = episode['vod_id']
             info.update({
                 '_type': 'url_transparent',
                 'ie_key': 'Piksel',
-                'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + episode['vod_id'],
+                'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + vod_id,
+                'id': vod_id,
             })
         else:
-            audio = episode['audio']
-            audio_path = audio['audio']
-            info['formats'] = self._extract_m3u8_formats(
-                'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path,
-                episode_id, 'm4a', entry_protocol='m3u8_native',
-                m3u8_id='hls', fatal=False)
-            for f in info['formats']:
-                f['language'] = lang
+            if fetch_episode:
+                audio_path = episode['audio']['audio']
+                info['formats'] = self._extract_m3u8_formats(
+                    'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path,
+                    episode_id, 'm4a', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False)
+                for f in info['formats']:
+                    f['language'] = lang
+            else:
+                info.update({
+                    '_type': 'url_transparent',
+                    'ie_key': NhkVodIE.ie_key(),
+                    'url': url,
+                })
         return info
+
+
+class NhkVodIE(NhkBaseIE):
+    _VALID_URL = r'%s%s(?P<id>\d{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
+    # Content available only for a limited period of time. Visit
+    # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
+    _TESTS = [{
+        # video clip
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/',
+        'md5': '7a90abcfe610ec22a6bfe15bd46b30ca',
+        'info_dict': {
+            'id': 'a95j5iza',
+            'ext': 'mp4',
+            'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU",
+            'description': 'md5:5aee4a9f9d81c26281862382103b0ea5',
+            'timestamp': 1565965194,
+            'upload_date': '20190816',
+        },
+    }, {
+        # audio clip
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/',
+        'info_dict': {
+            'id': 'r_inventions-20201104-1-en',
+            'ext': 'm4a',
+            'title': "Japan's Top Inventions - Miniature Video Cameras",
+            'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/plugin-20190404-1/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        return self._extract_episode_info(url)
+
+
+class NhkVodProgramIE(NhkBaseIE):
+    _VALID_URL = r'%s/program%s(?P<id>[0-9a-z]+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
+    _TESTS = [{
+        # video program episodes
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway',
+        'info_dict': {
+            'id': 'japanrailway',
+            'title': 'Japan Railway Journal',
+        },
+        'playlist_mincount': 1,
+    }, {
+        # video program clips
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip',
+        'info_dict': {
+            'id': 'japanrailway',
+            'title': 'Japan Railway Journal',
+        },
+        'playlist_mincount': 5,
+    }, {
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/',
+        'only_matching': True,
+    }, {
+        # audio program
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        lang, m_type, program_id, episode_type = re.match(self._VALID_URL, url).groups()
+
+        episodes = self._call_api(
+            program_id, lang, m_type == 'video', False, episode_type == 'clip')
+
+        entries = []
+        for episode in episodes:
+            episode_path = episode.get('url')
+            if not episode_path:
+                continue
+            entries.append(self._extract_episode_info(
+                urljoin(url, episode_path), episode))
+
+        program_title = None
+        if entries:
+            program_title = entries[0].get('series')
+
+        return self.playlist_result(entries, program_id, program_title)
index eb07ca7765e6ccfe08f856f44647f1ea7d7f706f..a85fc3d5c9dd33c1770454372181d89ef2e55b5c 100644 (file)
@@ -1,20 +1,23 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import json
 import datetime
+import functools
+import json
+import math
 
 from .common import InfoExtractor
 from ..compat import (
     compat_parse_qs,
-    compat_urlparse,
+    compat_urllib_parse_urlparse,
 )
 from ..utils import (
     determine_ext,
     dict_get,
     ExtractorError,
-    int_or_none,
     float_or_none,
+    InAdvancePagedList,
+    int_or_none,
     parse_duration,
     parse_iso8601,
     remove_start,
@@ -181,7 +184,7 @@ def _login(self):
         if urlh is False:
             login_ok = False
         else:
-            parts = compat_urlparse.urlparse(urlh.geturl())
+            parts = compat_urllib_parse_urlparse(urlh.geturl())
             if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login':
                 login_ok = False
         if not login_ok:
@@ -292,7 +295,7 @@ def _format_id_from_url(video_url):
                 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
                 video_id, 'Downloading flv info')
 
-            flv_info = compat_urlparse.parse_qs(flv_info_webpage)
+            flv_info = compat_parse_qs(flv_info_webpage)
             if 'url' not in flv_info:
                 if 'deleted' in flv_info:
                     raise ExtractorError('The video has been deleted.',
@@ -437,34 +440,76 @@ def get_video_info(items):
 
 
 class NiconicoPlaylistIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/mylist/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/)?mylist/(?P<id>\d+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.nicovideo.jp/mylist/27411728',
         'info_dict': {
             'id': '27411728',
             'title': 'AKB48のオールナイトニッポン',
+            'description': 'md5:d89694c5ded4b6c693dea2db6e41aa08',
+            'uploader': 'のっく',
+            'uploader_id': '805442',
         },
         'playlist_mincount': 225,
-    }
+    }, {
+        'url': 'https://www.nicovideo.jp/user/805442/mylist/27411728',
+        'only_matching': True,
+    }]
+    _PAGE_SIZE = 100
+
+    def _call_api(self, list_id, resource, query):
+        return self._download_json(
+            'https://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id,
+            'Downloading %s JSON metatdata' % resource, query=query,
+            headers={'X-Frontend-Id': 6})['data']['mylist']
+
+    def _parse_owner(self, item):
+        owner = item.get('owner') or {}
+        if owner:
+            return {
+                'uploader': owner.get('name'),
+                'uploader_id': owner.get('id'),
+            }
+        return {}
+
+    def _fetch_page(self, list_id, page):
+        page += 1
+        items = self._call_api(list_id, 'page %d' % page, {
+            'page': page,
+            'pageSize': self._PAGE_SIZE,
+        })['items']
+        for item in items:
+            video = item.get('video') or {}
+            video_id = video.get('id')
+            if not video_id:
+                continue
+            count = video.get('count') or {}
+            get_count = lambda x: int_or_none(count.get(x))
+            info = {
+                '_type': 'url',
+                'id': video_id,
+                'title': video.get('title'),
+                'url': 'https://www.nicovideo.jp/watch/' + video_id,
+                'description': video.get('shortDescription'),
+                'duration': int_or_none(video.get('duration')),
+                'view_count': get_count('view'),
+                'comment_count': get_count('comment'),
+                'ie_key': NiconicoIE.ie_key(),
+            }
+            info.update(self._parse_owner(video))
+            yield info
 
     def _real_extract(self, url):
         list_id = self._match_id(url)
-        webpage = self._download_webpage(url, list_id)
-
-        entries_json = self._search_regex(r'Mylist\.preload\(\d+, (\[.*\])\);',
-                                          webpage, 'entries')
-        entries = json.loads(entries_json)
-        entries = [{
-            '_type': 'url',
-            'ie_key': NiconicoIE.ie_key(),
-            'url': ('http://www.nicovideo.jp/watch/%s' %
-                    entry['item_data']['video_id']),
-        } for entry in entries]
-
-        return {
-            '_type': 'playlist',
-            'title': self._search_regex(r'\s+name: "(.*?)"', webpage, 'title'),
-            'id': list_id,
-            'entries': entries,
-        }
+        mylist = self._call_api(list_id, 'list', {
+            'pageSize': 1,
+        })
+        entries = InAdvancePagedList(
+            functools.partial(self._fetch_page, list_id),
+            math.ceil(mylist['totalItemCount'] / self._PAGE_SIZE),
+            self._PAGE_SIZE)
+        result = self.playlist_result(
+            entries, list_id, mylist.get('name'), mylist.get('description'))
+        result.update(self._parse_owner(mylist))
+        return result
index 65754c5e703acfc38e075f5bc974b5d4815c12d2..a569c889e7729fc5302eee4e6759f0be6b457670 100644 (file)
@@ -5,10 +5,11 @@
 
 from .common import InfoExtractor
 from ..utils import (
-    parse_iso8601,
-    float_or_none,
     ExtractorError,
+    float_or_none,
     int_or_none,
+    parse_iso8601,
+    try_get,
 )
 
 
@@ -35,7 +36,7 @@ def _real_extract(self, url):
                 '$include': '[HasClosedCaptions]',
             })
 
-        if content_package.get('Constraints', {}).get('Security', {}).get('Type'):
+        if try_get(content_package, lambda x: x['Constraints']['Security']['Type']):
             raise ExtractorError('This video is DRM protected.', expected=True)
 
         manifest_base_url = content_package_url + 'manifest.'
@@ -52,7 +53,7 @@ def _real_extract(self, url):
         self._sort_formats(formats)
 
         thumbnails = []
-        for image in content.get('Images', []):
+        for image in (content.get('Images') or []):
             image_url = image.get('Url')
             if not image_url:
                 continue
@@ -70,7 +71,7 @@ def _real_extract(self, url):
                     continue
                 container.append(e_name)
 
-        season = content.get('Season', {})
+        season = content.get('Season') or {}
 
         info = {
             'id': content_id,
@@ -79,13 +80,14 @@ def _real_extract(self, url):
             'timestamp': parse_iso8601(content.get('BroadcastDateTime')),
             'episode_number': int_or_none(content.get('Episode')),
             'season': season.get('Name'),
-            'season_number': season.get('Number'),
+            'season_number': int_or_none(season.get('Number')),
             'season_id': season.get('Id'),
-            'series': content.get('Media', {}).get('Name'),
+            'series': try_get(content, lambda x: x['Media']['Name']),
             'tags': tags,
             'categories': categories,
             'duration': float_or_none(content_package.get('Duration')),
             'formats': formats,
+            'thumbnails': thumbnails,
         }
 
         if content_package.get('HasClosedCaptions'):
index 4a395546f67331d4e9b32073ba3ed576b0c0c468..69178e1579c5de93e128c4419cc2ff7f95270854 100644 (file)
@@ -1,28 +1,67 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import itertools
+import random
 import re
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_str,
-    compat_urllib_parse_unquote,
-)
+from ..compat import compat_str
 from ..utils import (
     determine_ext,
     ExtractorError,
     int_or_none,
-    js_to_json,
-    NO_DEFAULT,
-    parse_age_limit,
     parse_duration,
+    str_or_none,
     try_get,
+    urljoin,
     url_or_none,
 )
 
 
 class NRKBaseIE(InfoExtractor):
     _GEO_COUNTRIES = ['NO']
+    _CDN_REPL_REGEX = r'''(?x)://
+        (?:
+            nrkod\d{1,2}-httpcache0-47115-cacheod0\.dna\.ip-only\.net/47115-cacheod0|
+            nrk-od-no\.telenorcdn\.net|
+            minicdn-od\.nrk\.no/od/nrkhd-osl-rr\.netwerk\.no/no
+        )/'''
+
+    def _extract_nrk_formats(self, asset_url, video_id):
+        if re.match(r'https?://[^/]+\.akamaihd\.net/i/', asset_url):
+            return self._extract_akamai_formats(asset_url, video_id)
+        asset_url = re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url)
+        formats = self._extract_m3u8_formats(
+            asset_url, video_id, 'mp4', 'm3u8_native', fatal=False)
+        if not formats and re.search(self._CDN_REPL_REGEX, asset_url):
+            formats = self._extract_m3u8_formats(
+                re.sub(self._CDN_REPL_REGEX, '://nrk-od-%02d.akamaized.net/no/' % random.randint(0, 99), asset_url),
+                video_id, 'mp4', 'm3u8_native', fatal=False)
+        return formats
+
+    def _raise_error(self, data):
+        MESSAGES = {
+            'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet',
+            'ProgramRightsHasExpired': 'Programmet har gått ut',
+            'NoProgramRights': 'Ikke tilgjengelig',
+            'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
+        }
+        message_type = data.get('messageType', '')
+        # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked*
+        if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is True:
+            self.raise_geo_restricted(
+                msg=MESSAGES.get('ProgramIsGeoBlocked'),
+                countries=self._GEO_COUNTRIES)
+        message = data.get('endUserMessage') or MESSAGES.get(message_type, message_type)
+        raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
+
+    def _call_api(self, path, video_id, item=None, note=None, fatal=True, query=None):
+        return self._download_json(
+            urljoin('http://psapi.nrk.no/', path),
+            video_id, note or 'Downloading %s JSON' % item,
+            fatal=fatal, query=query,
+            headers={'Accept-Encoding': 'gzip, deflate, br'})
 
 
 class NRKIE(NRKBaseIE):
@@ -41,7 +80,7 @@ class NRKIE(NRKBaseIE):
     _TESTS = [{
         # video
         'url': 'http://www.nrk.no/video/PS*150533',
-        'md5': '706f34cdf1322577589e369e522b50ef',
+        'md5': 'f46be075326e23ad0e524edfcb06aeb6',
         'info_dict': {
             'id': '150533',
             'ext': 'mp4',
@@ -55,7 +94,7 @@ class NRKIE(NRKBaseIE):
         # MD5 is unstable
         'info_dict': {
             'id': '154915',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Slik høres internett ut når du er blind',
             'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568',
             'duration': 20,
@@ -75,12 +114,50 @@ class NRKIE(NRKBaseIE):
     }, {
         'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999',
         'only_matching': True,
+    }, {
+        # podcast
+        'url': 'nrk:l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8',
+        'only_matching': True,
+    }, {
+        'url': 'nrk:podcast/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8',
+        'only_matching': True,
+    }, {
+        # clip
+        'url': 'nrk:150533',
+        'only_matching': True,
+    }, {
+        'url': 'nrk:clip/150533',
+        'only_matching': True,
+    }, {
+        # program
+        'url': 'nrk:MDDP12000117',
+        'only_matching': True,
+    }, {
+        'url': 'nrk:program/ENRK10100318',
+        'only_matching': True,
+    }, {
+        # direkte
+        'url': 'nrk:nrk1',
+        'only_matching': True,
+    }, {
+        'url': 'nrk:channel/nrk1',
+        'only_matching': True,
     }]
 
-    def _extract_from_playback(self, video_id):
-        manifest = self._download_json(
-            'http://psapi.nrk.no/playback/manifest/%s' % video_id,
-            video_id, 'Downloading manifest JSON')
+    def _real_extract(self, url):
+        video_id = self._match_id(url).split('/')[-1]
+
+        path_templ = 'playback/%s/' + video_id
+
+        def call_playback_api(item, query=None):
+            return self._call_api(path_templ % item, video_id, item, query=query)
+        # known values for preferredCdn: akamai, iponly, minicdn and telenor
+        manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'})
+
+        video_id = try_get(manifest, lambda x: x['id'], compat_str) or video_id
+
+        if manifest.get('playability') == 'nonPlayable':
+            self._raise_error(manifest['nonPlayable'])
 
         playable = manifest['playable']
 
@@ -93,15 +170,18 @@ def _extract_from_playback(self, video_id):
             format_url = url_or_none(asset.get('url'))
             if not format_url:
                 continue
-            if asset.get('format') == 'HLS' or determine_ext(format_url) == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
-                    m3u8_id='hls', fatal=False))
+            asset_format = (asset.get('format') or '').lower()
+            if asset_format == 'hls' or determine_ext(format_url) == 'm3u8':
+                formats.extend(self._extract_nrk_formats(format_url, video_id))
+            elif asset_format == 'mp3':
+                formats.append({
+                    'url': format_url,
+                    'format_id': asset_format,
+                    'vcodec': 'none',
+                })
         self._sort_formats(formats)
 
-        data = self._download_json(
-            'http://psapi.nrk.no/playback/metadata/%s' % video_id,
-            video_id, 'Downloading metadata JSON')
+        data = call_playback_api('metadata')
 
         preplay = data['preplay']
         titles = preplay['titles']
@@ -125,67 +205,125 @@ def _extract_from_playback(self, video_id):
                 'height': int_or_none(image.get('pixelHeight')),
             })
 
-        return {
+        subtitles = {}
+        for sub in try_get(playable, lambda x: x['subtitles'], list) or []:
+            if not isinstance(sub, dict):
+                continue
+            sub_url = url_or_none(sub.get('webVtt'))
+            if not sub_url:
+                continue
+            sub_key = str_or_none(sub.get('language')) or 'nb'
+            sub_type = str_or_none(sub.get('type'))
+            if sub_type:
+                sub_key += '-%s' % sub_type
+            subtitles.setdefault(sub_key, []).append({
+                'url': sub_url,
+            })
+
+        legal_age = try_get(
+            data, lambda x: x['legalAge']['body']['rating']['code'], compat_str)
+        # https://en.wikipedia.org/wiki/Norwegian_Media_Authority
+        if legal_age == 'A':
+            age_limit = 0
+        elif legal_age.isdigit():
+            age_limit = int_or_none(legal_age)
+        else:
+            age_limit = None
+
+        is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series'
+
+        info = {
             'id': video_id,
             'title': title,
             'alt_title': alt_title,
             'description': description,
             'duration': duration,
             'thumbnails': thumbnails,
+            'age_limit': age_limit,
             'formats': formats,
+            'subtitles': subtitles,
         }
 
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        return self._extract_from_playback(video_id)
+        if is_series:
+            series = season_id = season_number = episode = episode_number = None
+            programs = self._call_api(
+                'programs/%s' % video_id, video_id, 'programs', fatal=False)
+            if programs and isinstance(programs, dict):
+                series = str_or_none(programs.get('seriesTitle'))
+                season_id = str_or_none(programs.get('seasonId'))
+                season_number = int_or_none(programs.get('seasonNumber'))
+                episode = str_or_none(programs.get('episodeTitle'))
+                episode_number = int_or_none(programs.get('episodeNumber'))
+            if not series:
+                series = title
+            if alt_title:
+                title += ' - %s' % alt_title
+            if not season_number:
+                season_number = int_or_none(self._search_regex(
+                    r'Sesong\s+(\d+)', description or '', 'season number',
+                    default=None))
+            if not episode:
+                episode = alt_title if is_series else None
+            if not episode_number:
+                episode_number = int_or_none(self._search_regex(
+                    r'^(\d+)\.', episode or '', 'episode number',
+                    default=None))
+            if not episode_number:
+                episode_number = int_or_none(self._search_regex(
+                    r'\((\d+)\s*:\s*\d+\)', description or '',
+                    'episode number', default=None))
+            info.update({
+                'title': title,
+                'series': series,
+                'season_id': season_id,
+                'season_number': season_number,
+                'episode': episode,
+                'episode_number': episode_number,
+            })
+
+        return info
 
 
-class NRKTVIE(NRKBaseIE):
+class NRKTVIE(InfoExtractor):
     IE_DESC = 'NRK TV and NRK Radio'
     _EPISODE_RE = r'(?P<id>[a-zA-Z]{4}\d{8})'
-    _VALID_URL = r'''(?x)
-                        https?://
-                            (?:tv|radio)\.nrk(?:super)?\.no/
-                            (?:serie(?:/[^/]+){1,2}|program)/
-                            (?![Ee]pisodes)%s
-                            (?:/\d{2}-\d{2}-\d{4})?
-                            (?:\#del=(?P<part_id>\d+))?
-                    ''' % _EPISODE_RE
-    _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no')
+    _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*%s' % _EPISODE_RE
     _TESTS = [{
         'url': 'https://tv.nrk.no/program/MDDP12000117',
-        'md5': '8270824df46ec629b66aeaa5796b36fb',
+        'md5': 'c4a5960f1b00b40d47db65c1064e0ab1',
         'info_dict': {
-            'id': 'MDDP12000117AA',
+            'id': 'MDDP12000117',
             'ext': 'mp4',
             'title': 'Alarm Trolltunga',
             'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce',
-            'duration': 2223,
+            'duration': 2223.44,
             'age_limit': 6,
         },
     }, {
         'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
-        'md5': '9a167e54d04671eb6317a37b7bc8a280',
+        'md5': '8d40dab61cea8ab0114e090b029a0565',
         'info_dict': {
-            'id': 'MUHH48000314AA',
+            'id': 'MUHH48000314',
             'ext': 'mp4',
-            'title': '20 spørsmål 23.05.2014',
+            'title': '20 spørsmål - 23. mai 2014',
+            'alt_title': '23. mai 2014',
             'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
             'duration': 1741,
             'series': '20 spørsmål',
-            'episode': '23.05.2014',
+            'episode': '23. mai 2014',
+            'age_limit': 0,
         },
-        'skip': 'NoProgramRights',
     }, {
         'url': 'https://tv.nrk.no/program/mdfp15000514',
         'info_dict': {
-            'id': 'MDFP15000514CA',
+            'id': 'MDFP15000514',
             'ext': 'mp4',
-            'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014',
+            'title': 'Kunnskapskanalen - Grunnlovsjubiléet - Stor ståhei for ingenting',
             'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db',
-            'duration': 4605,
+            'duration': 4605.08,
             'series': 'Kunnskapskanalen',
-            'episode': '24.05.2014',
+            'episode': 'Grunnlovsjubiléet - Stor ståhei for ingenting',
+            'age_limit': 0,
         },
         'params': {
             'skip_download': True,
@@ -194,63 +332,41 @@ class NRKTVIE(NRKBaseIE):
         # single playlist video
         'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
         'info_dict': {
-            'id': 'MSPO40010515-part2',
-            'ext': 'flv',
-            'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
-            'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+            'id': 'MSPO40010515',
+            'ext': 'mp4',
+            'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015',
+            'description': 'md5:c03aba1e917561eface5214020551b7a',
+            'age_limit': 0,
         },
         'params': {
             'skip_download': True,
         },
-        'expected_warnings': ['Video is geo restricted'],
+        'expected_warnings': ['Failed to download m3u8 information'],
         'skip': 'particular part is not supported currently',
     }, {
         'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
-        'playlist': [{
-            'info_dict': {
-                'id': 'MSPO40010515AH',
-                'ext': 'mp4',
-                'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)',
-                'description': 'md5:1f97a41f05a9486ee00c56f35f82993d',
-                'duration': 772,
-                'series': 'Tour de Ski',
-                'episode': '06.01.2015',
-            },
-            'params': {
-                'skip_download': True,
-            },
-        }, {
-            'info_dict': {
-                'id': 'MSPO40010515BH',
-                'ext': 'mp4',
-                'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)',
-                'description': 'md5:1f97a41f05a9486ee00c56f35f82993d',
-                'duration': 6175,
-                'series': 'Tour de Ski',
-                'episode': '06.01.2015',
-            },
-            'params': {
-                'skip_download': True,
-            },
-        }],
         'info_dict': {
             'id': 'MSPO40010515',
+            'ext': 'mp4',
             'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015',
-            'description': 'md5:1f97a41f05a9486ee00c56f35f82993d',
+            'description': 'md5:c03aba1e917561eface5214020551b7a',
+            'age_limit': 0,
         },
-        'expected_warnings': ['Video is geo restricted'],
+        'expected_warnings': ['Failed to download m3u8 information'],
+        'skip': 'Ikke tilgjengelig utenfor Norge',
     }, {
         'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13',
         'info_dict': {
-            'id': 'KMTE50001317AA',
+            'id': 'KMTE50001317',
             'ext': 'mp4',
-            'title': 'Anno 13:30',
+            'title': 'Anno - 13. episode',
             'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa',
             'duration': 2340,
             'series': 'Anno',
-            'episode': '13:30',
+            'episode': '13. episode',
             'season_number': 3,
             'episode_number': 13,
+            'age_limit': 0,
         },
         'params': {
             'skip_download': True,
@@ -258,215 +374,50 @@ class NRKTVIE(NRKBaseIE):
     }, {
         'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017',
         'info_dict': {
-            'id': 'MUHH46000317AA',
+            'id': 'MUHH46000317',
             'ext': 'mp4',
             'title': 'Nytt på Nytt 27.01.2017',
             'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b',
             'duration': 1796,
             'series': 'Nytt på nytt',
             'episode': '27.01.2017',
+            'age_limit': 0,
         },
         'params': {
             'skip_download': True,
         },
+        'skip': 'ProgramRightsHasExpired',
     }, {
         'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#',
         'only_matching': True,
     }, {
         'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller',
         'only_matching': True,
+    }, {
+        'url': 'https://radio.nrk.no/serie/dagsnytt/sesong/201507/NPUB21019315',
+        'only_matching': True,
     }]
 
-    _api_host = None
-
-    def _extract_from_mediaelement(self, video_id):
-        api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS
-
-        for api_host in api_hosts:
-            data = self._download_json(
-                'http://%s/mediaelement/%s' % (api_host, video_id),
-                video_id, 'Downloading mediaelement JSON',
-                fatal=api_host == api_hosts[-1])
-            if not data:
-                continue
-            self._api_host = api_host
-            break
-
-        title = data.get('fullTitle') or data.get('mainTitle') or data['title']
-        video_id = data.get('id') or video_id
-
-        entries = []
-
-        conviva = data.get('convivaStatistics') or {}
-        live = (data.get('mediaElementType') == 'Live'
-                or data.get('isLive') is True or conviva.get('isLive'))
-
-        def make_title(t):
-            return self._live_title(t) if live else t
-
-        media_assets = data.get('mediaAssets')
-        if media_assets and isinstance(media_assets, list):
-            def video_id_and_title(idx):
-                return ((video_id, title) if len(media_assets) == 1
-                        else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx)))
-            for num, asset in enumerate(media_assets, 1):
-                asset_url = asset.get('url')
-                if not asset_url:
-                    continue
-                formats = self._extract_akamai_formats(asset_url, video_id)
-                if not formats:
-                    continue
-                self._sort_formats(formats)
-
-                # Some f4m streams may not work with hdcore in fragments' URLs
-                for f in formats:
-                    extra_param = f.get('extra_param_to_segment_url')
-                    if extra_param and 'hdcore' in extra_param:
-                        del f['extra_param_to_segment_url']
-
-                entry_id, entry_title = video_id_and_title(num)
-                duration = parse_duration(asset.get('duration'))
-                subtitles = {}
-                for subtitle in ('webVtt', 'timedText'):
-                    subtitle_url = asset.get('%sSubtitlesUrl' % subtitle)
-                    if subtitle_url:
-                        subtitles.setdefault('no', []).append({
-                            'url': compat_urllib_parse_unquote(subtitle_url)
-                        })
-                entries.append({
-                    'id': asset.get('carrierId') or entry_id,
-                    'title': make_title(entry_title),
-                    'duration': duration,
-                    'subtitles': subtitles,
-                    'formats': formats,
-                })
-
-        if not entries:
-            media_url = data.get('mediaUrl')
-            if media_url:
-                formats = self._extract_akamai_formats(media_url, video_id)
-                self._sort_formats(formats)
-                duration = parse_duration(data.get('duration'))
-                entries = [{
-                    'id': video_id,
-                    'title': make_title(title),
-                    'duration': duration,
-                    'formats': formats,
-                }]
-
-        if not entries:
-            MESSAGES = {
-                'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet',
-                'ProgramRightsHasExpired': 'Programmet har gått ut',
-                'NoProgramRights': 'Ikke tilgjengelig',
-                'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
-            }
-            message_type = data.get('messageType', '')
-            # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked*
-            if 'IsGeoBlocked' in message_type:
-                self.raise_geo_restricted(
-                    msg=MESSAGES.get('ProgramIsGeoBlocked'),
-                    countries=self._GEO_COUNTRIES)
-            raise ExtractorError(
-                '%s said: %s' % (self.IE_NAME, MESSAGES.get(
-                    message_type, message_type)),
-                expected=True)
-
-        series = conviva.get('seriesName') or data.get('seriesTitle')
-        episode = conviva.get('episodeName') or data.get('episodeNumberOrDate')
-
-        season_number = None
-        episode_number = None
-        if data.get('mediaElementType') == 'Episode':
-            _season_episode = data.get('scoresStatistics', {}).get('springStreamStream') or \
-                data.get('relativeOriginUrl', '')
-            EPISODENUM_RE = [
-                r'/s(?P<season>\d{,2})e(?P<episode>\d{,2})\.',
-                r'/sesong-(?P<season>\d{,2})/episode-(?P<episode>\d{,2})',
-            ]
-            season_number = int_or_none(self._search_regex(
-                EPISODENUM_RE, _season_episode, 'season number',
-                default=None, group='season'))
-            episode_number = int_or_none(self._search_regex(
-                EPISODENUM_RE, _season_episode, 'episode number',
-                default=None, group='episode'))
-
-        thumbnails = None
-        images = data.get('images')
-        if images and isinstance(images, dict):
-            web_images = images.get('webImages')
-            if isinstance(web_images, list):
-                thumbnails = [{
-                    'url': image['imageUrl'],
-                    'width': int_or_none(image.get('width')),
-                    'height': int_or_none(image.get('height')),
-                } for image in web_images if image.get('imageUrl')]
-
-        description = data.get('description')
-        category = data.get('mediaAnalytics', {}).get('category')
-
-        common_info = {
-            'description': description,
-            'series': series,
-            'episode': episode,
-            'season_number': season_number,
-            'episode_number': episode_number,
-            'categories': [category] if category else None,
-            'age_limit': parse_age_limit(data.get('legalAge')),
-            'thumbnails': thumbnails,
-        }
-
-        vcodec = 'none' if data.get('mediaType') == 'Audio' else None
-
-        for entry in entries:
-            entry.update(common_info)
-            for f in entry['formats']:
-                f['vcodec'] = vcodec
-
-        points = data.get('shortIndexPoints')
-        if isinstance(points, list):
-            chapters = []
-            for next_num, point in enumerate(points, start=1):
-                if not isinstance(point, dict):
-                    continue
-                start_time = parse_duration(point.get('startPoint'))
-                if start_time is None:
-                    continue
-                end_time = parse_duration(
-                    data.get('duration')
-                    if next_num == len(points)
-                    else points[next_num].get('startPoint'))
-                if end_time is None:
-                    continue
-                chapters.append({
-                    'start_time': start_time,
-                    'end_time': end_time,
-                    'title': point.get('title'),
-                })
-            if chapters and len(entries) == 1:
-                entries[0]['chapters'] = chapters
-
-        return self.playlist_result(entries, video_id, title, description)
-
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        return self._extract_from_mediaelement(video_id)
+        return self.url_result(
+            'nrk:%s' % video_id, ie=NRKIE.ie_key(), video_id=video_id)
 
 
 class NRKTVEpisodeIE(InfoExtractor):
-    _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P<id>[^/]+/sesong/\d+/episode/\d+)'
+    _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P<id>[^/]+/sesong/(?P<season_number>\d+)/episode/(?P<episode_number>\d+))'
     _TESTS = [{
         'url': 'https://tv.nrk.no/serie/hellums-kro/sesong/1/episode/2',
         'info_dict': {
-            'id': 'MUHH36005220BA',
+            'id': 'MUHH36005220',
             'ext': 'mp4',
-            'title': 'Kro, krig og kjærlighet 2:6',
-            'description': 'md5:b32a7dc0b1ed27c8064f58b97bda4350',
-            'duration': 1563,
+            'title': 'Hellums kro - 2. Kro, krig og kjærlighet',
+            'description': 'md5:ad92ddffc04cea8ce14b415deef81787',
+            'duration': 1563.92,
             'series': 'Hellums kro',
             'season_number': 1,
             'episode_number': 2,
-            'episode': '2:6',
+            'episode': '2. Kro, krig og kjærlighet',
             'age_limit': 6,
         },
         'params': {
@@ -475,15 +426,16 @@ class NRKTVEpisodeIE(InfoExtractor):
     }, {
         'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8',
         'info_dict': {
-            'id': 'MSUI14000816AA',
+            'id': 'MSUI14000816',
             'ext': 'mp4',
-            'title': 'Backstage 8:30',
+            'title': 'Backstage - 8. episode',
             'description': 'md5:de6ca5d5a2d56849e4021f2bf2850df4',
             'duration': 1320,
             'series': 'Backstage',
             'season_number': 1,
             'episode_number': 8,
-            'episode': '8:30',
+            'episode': '8. episode',
+            'age_limit': 0,
         },
         'params': {
             'skip_download': True,
@@ -492,7 +444,7 @@ class NRKTVEpisodeIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        display_id = self._match_id(url)
+        display_id, season_number, episode_number = re.match(self._VALID_URL, url).groups()
 
         webpage = self._download_webpage(url, display_id)
 
@@ -504,91 +456,170 @@ def _real_extract(self, url):
         assert re.match(NRKTVIE._EPISODE_RE, nrk_id)
 
         info.update({
-            '_type': 'url_transparent',
+            '_type': 'url',
             'id': nrk_id,
             'url': 'nrk:%s' % nrk_id,
             'ie_key': NRKIE.ie_key(),
+            'season_number': int(season_number),
+            'episode_number': int(episode_number),
         })
         return info
 
 
-class NRKTVSerieBaseIE(InfoExtractor):
-    def _extract_series(self, webpage, display_id, fatal=True):
-        config = self._parse_json(
-            self._search_regex(
-                (r'INITIAL_DATA(?:_V\d)?_*\s*=\s*({.+?})\s*;',
-                 r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>'),
-                webpage, 'config', default='{}' if not fatal else NO_DEFAULT),
-            display_id, fatal=False, transform_source=js_to_json)
-        if not config:
-            return
-        return try_get(
-            config,
-            (lambda x: x['initialState']['series'], lambda x: x['series']),
-            dict)
-
-    def _extract_seasons(self, seasons):
-        if not isinstance(seasons, list):
-            return []
-        entries = []
-        for season in seasons:
-            entries.extend(self._extract_episodes(season))
-        return entries
-
-    def _extract_episodes(self, season):
-        if not isinstance(season, dict):
-            return []
-        return self._extract_entries(season.get('episodes'))
-
+class NRKTVSerieBaseIE(NRKBaseIE):
     def _extract_entries(self, entry_list):
         if not isinstance(entry_list, list):
             return []
         entries = []
         for episode in entry_list:
-            nrk_id = episode.get('prfId')
+            nrk_id = episode.get('prfId') or episode.get('episodeId')
             if not nrk_id or not isinstance(nrk_id, compat_str):
                 continue
             entries.append(self.url_result(
                 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id))
         return entries
 
+    _ASSETS_KEYS = ('episodes', 'instalments',)
+
+    def _extract_assets_key(self, embedded):
+        for asset_key in self._ASSETS_KEYS:
+            if embedded.get(asset_key):
+                return asset_key
+
+    @staticmethod
+    def _catalog_name(serie_kind):
+        return 'podcast' if serie_kind in ('podcast', 'podkast') else 'series'
+
+    def _entries(self, data, display_id):
+        for page_num in itertools.count(1):
+            embedded = data.get('_embedded') or data
+            if not isinstance(embedded, dict):
+                break
+            assets_key = self._extract_assets_key(embedded)
+            if not assets_key:
+                break
+            # Extract entries
+            entries = try_get(
+                embedded,
+                (lambda x: x[assets_key]['_embedded'][assets_key],
+                 lambda x: x[assets_key]),
+                list)
+            for e in self._extract_entries(entries):
+                yield e
+            # Find next URL
+            next_url_path = try_get(
+                data,
+                (lambda x: x['_links']['next']['href'],
+                 lambda x: x['_embedded'][assets_key]['_links']['next']['href']),
+                compat_str)
+            if not next_url_path:
+                break
+            data = self._call_api(
+                next_url_path, display_id,
+                note='Downloading %s JSON page %d' % (assets_key, page_num),
+                fatal=False)
+            if not data:
+                break
+
 
 class NRKTVSeasonIE(NRKTVSerieBaseIE):
-    _VALID_URL = r'https?://tv\.nrk\.no/serie/[^/]+/sesong/(?P<id>\d+)'
-    _TEST = {
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?P<domain>tv|radio)\.nrk\.no/
+                        (?P<serie_kind>serie|pod[ck]ast)/
+                        (?P<serie>[^/]+)/
+                        (?:
+                            (?:sesong/)?(?P<id>\d+)|
+                            sesong/(?P<id_2>[^/?#&]+)
+                        )
+                    '''
+    _TESTS = [{
         'url': 'https://tv.nrk.no/serie/backstage/sesong/1',
         'info_dict': {
-            'id': '1',
+            'id': 'backstage/1',
             'title': 'Sesong 1',
         },
         'playlist_mincount': 30,
-    }
+    }, {
+        # no /sesong/ in path
+        'url': 'https://tv.nrk.no/serie/lindmo/2016',
+        'info_dict': {
+            'id': 'lindmo/2016',
+            'title': '2016',
+        },
+        'playlist_mincount': 29,
+    }, {
+        # weird nested _embedded in catalog JSON response
+        'url': 'https://radio.nrk.no/serie/dickie-dick-dickens/sesong/1',
+        'info_dict': {
+            'id': 'dickie-dick-dickens/1',
+            'title': 'Sesong 1',
+        },
+        'playlist_mincount': 11,
+    }, {
+        # 841 entries, multi page
+        'url': 'https://radio.nrk.no/serie/dagsnytt/sesong/201509',
+        'info_dict': {
+            'id': 'dagsnytt/201509',
+            'title': 'September 2015',
+        },
+        'playlist_mincount': 841,
+    }, {
+        # 180 entries, single page
+        'url': 'https://tv.nrk.no/serie/spangas/sesong/1',
+        'only_matching': True,
+    }, {
+        'url': 'https://radio.nrk.no/podkast/hele_historien/sesong/diagnose-kverulant',
+        'info_dict': {
+            'id': 'hele_historien/diagnose-kverulant',
+            'title': 'Diagnose kverulant',
+        },
+        'playlist_mincount': 3,
+    }, {
+        'url': 'https://radio.nrk.no/podkast/loerdagsraadet/sesong/202101',
+        'only_matching': True,
+    }]
 
     @classmethod
     def suitable(cls, url):
-        return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url)
+        return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) or NRKRadioPodkastIE.suitable(url)
                 else super(NRKTVSeasonIE, cls).suitable(url))
 
     def _real_extract(self, url):
-        display_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, display_id)
-
-        series = self._extract_series(webpage, display_id)
-
-        season = next(
-            s for s in series['seasons']
-            if int(display_id) == s.get('seasonNumber'))
-
-        title = try_get(season, lambda x: x['titles']['title'], compat_str)
+        mobj = re.match(self._VALID_URL, url)
+        domain = mobj.group('domain')
+        serie_kind = mobj.group('serie_kind')
+        serie = mobj.group('serie')
+        season_id = mobj.group('id') or mobj.group('id_2')
+        display_id = '%s/%s' % (serie, season_id)
+
+        data = self._call_api(
+            '%s/catalog/%s/%s/seasons/%s'
+            % (domain, self._catalog_name(serie_kind), serie, season_id),
+            display_id, 'season', query={'pageSize': 50})
+
+        title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id
         return self.playlist_result(
-            self._extract_episodes(season), display_id, title)
+            self._entries(data, display_id),
+            display_id, title)
 
 
 class NRKTVSeriesIE(NRKTVSerieBaseIE):
-    _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)'
-    _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)'
+    _VALID_URL = r'https?://(?P<domain>(?:tv|radio)\.nrk|(?:tv\.)?nrksuper)\.no/(?P<serie_kind>serie|pod[ck]ast)/(?P<id>[^/]+)'
     _TESTS = [{
+        # new layout, instalments
+        'url': 'https://tv.nrk.no/serie/groenn-glede',
+        'info_dict': {
+            'id': 'groenn-glede',
+            'title': 'Grønn glede',
+            'description': 'md5:7576e92ae7f65da6993cf90ee29e4608',
+        },
+        'playlist_mincount': 90,
+    }, {
+        # new layout, instalments, more entries
+        'url': 'https://tv.nrk.no/serie/lindmo',
+        'only_matching': True,
+    }, {
         'url': 'https://tv.nrk.no/serie/blank',
         'info_dict': {
             'id': 'blank',
@@ -602,25 +633,16 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE):
         'info_dict': {
             'id': 'backstage',
             'title': 'Backstage',
-            'description': 'md5:c3ec3a35736fca0f9e1207b5511143d3',
+            'description': 'md5:63692ceb96813d9a207e9910483d948b',
         },
         'playlist_mincount': 60,
-    }, {
-        # new layout, instalments
-        'url': 'https://tv.nrk.no/serie/groenn-glede',
-        'info_dict': {
-            'id': 'groenn-glede',
-            'title': 'Grønn glede',
-            'description': 'md5:7576e92ae7f65da6993cf90ee29e4608',
-        },
-        'playlist_mincount': 10,
     }, {
         # old layout
         'url': 'https://tv.nrksuper.no/serie/labyrint',
         'info_dict': {
             'id': 'labyrint',
             'title': 'Labyrint',
-            'description': 'md5:318b597330fdac5959247c9b69fdb1ec',
+            'description': 'I Daidalos sin undersjøiske Labyrint venter spennende oppgaver, skumle robotskapninger og slim.',
         },
         'playlist_mincount': 3,
     }, {
@@ -632,53 +654,75 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE):
     }, {
         'url': 'https://tv.nrk.no/serie/postmann-pat',
         'only_matching': True,
+    }, {
+        'url': 'https://radio.nrk.no/serie/dickie-dick-dickens',
+        'info_dict': {
+            'id': 'dickie-dick-dickens',
+            'title': 'Dickie Dick Dickens',
+            'description': 'md5:19e67411ffe57f7dce08a943d7a0b91f',
+        },
+        'playlist_mincount': 8,
+    }, {
+        'url': 'https://nrksuper.no/serie/labyrint',
+        'only_matching': True,
+    }, {
+        'url': 'https://radio.nrk.no/podkast/ulrikkes_univers',
+        'info_dict': {
+            'id': 'ulrikkes_univers',
+        },
+        'playlist_mincount': 10,
+    }, {
+        'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/nrkno-poddkast-26588-134079-05042018030000',
+        'only_matching': True,
     }]
 
     @classmethod
     def suitable(cls, url):
         return (
             False if any(ie.suitable(url)
-                         for ie in (NRKTVIE, NRKTVEpisodeIE, NRKTVSeasonIE))
+                         for ie in (NRKTVIE, NRKTVEpisodeIE, NRKRadioPodkastIE, NRKTVSeasonIE))
             else super(NRKTVSeriesIE, cls).suitable(url))
 
     def _real_extract(self, url):
-        series_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, series_id)
-
-        # New layout (e.g. https://tv.nrk.no/serie/backstage)
-        series = self._extract_series(webpage, series_id, fatal=False)
-        if series:
-            title = try_get(series, lambda x: x['titles']['title'], compat_str)
-            description = try_get(
-                series, lambda x: x['titles']['subtitle'], compat_str)
-            entries = []
-            entries.extend(self._extract_seasons(series.get('seasons')))
-            entries.extend(self._extract_entries(series.get('instalments')))
-            entries.extend(self._extract_episodes(series.get('extraMaterial')))
-            return self.playlist_result(entries, series_id, title, description)
-
-        # Old layout (e.g. https://tv.nrksuper.no/serie/labyrint)
-        entries = [
-            self.url_result(
-                'https://tv.nrk.no/program/Episodes/{series}/{season}'.format(
-                    series=series_id, season=season_id))
-            for season_id in re.findall(self._ITEM_RE, webpage)
-        ]
-
-        title = self._html_search_meta(
-            'seriestitle', webpage,
-            'title', default=None) or self._og_search_title(
-            webpage, fatal=False)
-        if title:
-            title = self._search_regex(
-                r'NRK (?:Super )?TV\s*[-–]\s*(.+)', title, 'title', default=title)
+        site, serie_kind, series_id = re.match(self._VALID_URL, url).groups()
+        is_radio = site == 'radio.nrk'
+        domain = 'radio' if is_radio else 'tv'
+
+        size_prefix = 'p' if is_radio else 'embeddedInstalmentsP'
+        series = self._call_api(
+            '%s/catalog/%s/%s'
+            % (domain, self._catalog_name(serie_kind), series_id),
+            series_id, 'serie', query={size_prefix + 'ageSize': 50})
+        titles = try_get(series, [
+            lambda x: x['titles'],
+            lambda x: x[x['type']]['titles'],
+            lambda x: x[x['seriesType']]['titles'],
+        ]) or {}
 
-        description = self._html_search_meta(
-            'series_description', webpage,
-            'description', default=None) or self._og_search_description(webpage)
+        entries = []
+        entries.extend(self._entries(series, series_id))
+        embedded = series.get('_embedded') or {}
+        linked_seasons = try_get(series, lambda x: x['_links']['seasons']) or []
+        embedded_seasons = embedded.get('seasons') or []
+        if len(linked_seasons) > len(embedded_seasons):
+            for season in linked_seasons:
+                season_url = urljoin(url, season.get('href'))
+                if not season_url:
+                    season_name = season.get('name')
+                    if season_name and isinstance(season_name, compat_str):
+                        season_url = 'https://%s.nrk.no/serie/%s/sesong/%s' % (domain, series_id, season_name)
+                if season_url:
+                    entries.append(self.url_result(
+                        season_url, ie=NRKTVSeasonIE.ie_key(),
+                        video_title=season.get('title')))
+        else:
+            for season in embedded_seasons:
+                entries.extend(self._entries(season, series_id))
+        entries.extend(self._entries(
+            embedded.get('extraMaterial') or {}, series_id))
 
-        return self.playlist_result(entries, series_id, title, description)
+        return self.playlist_result(
+            entries, series_id, titles.get('title'), titles.get('subtitle'))
 
 
 class NRKTVDirekteIE(NRKTVIE):
@@ -694,6 +738,38 @@ class NRKTVDirekteIE(NRKTVIE):
     }]
 
 
+class NRKRadioPodkastIE(InfoExtractor):
+    _VALID_URL = r'https?://radio\.nrk\.no/pod[ck]ast/(?:[^/]+/)+(?P<id>l_[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+
+    _TESTS = [{
+        'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8',
+        'md5': '8d40dab61cea8ab0114e090b029a0565',
+        'info_dict': {
+            'id': 'MUHH48000314AA',
+            'ext': 'mp4',
+            'title': '20 spørsmål 23.05.2014',
+            'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
+            'duration': 1741,
+            'series': '20 spørsmål',
+            'episode': '23.05.2014',
+        },
+    }, {
+        'url': 'https://radio.nrk.no/podcast/ulrikkes_univers/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8',
+        'only_matching': True,
+    }, {
+        'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/sesong/1/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8',
+        'only_matching': True,
+    }, {
+        'url': 'https://radio.nrk.no/podkast/hele_historien/sesong/bortfoert-i-bergen/l_774d1a2c-7aa7-4965-8d1a-2c7aa7d9652c',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self.url_result(
+            'nrk:%s' % video_id, ie=NRKIE.ie_key(), video_id=video_id)
+
+
 class NRKPlaylistBaseIE(InfoExtractor):
     def _extract_description(self, webpage):
         pass
@@ -782,14 +858,8 @@ class NRKSkoleIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(
-            'https://mimir.nrk.no/plugin/1.0/static?mediaId=%s' % video_id,
-            video_id)
-
-        nrk_id = self._parse_json(
-            self._search_regex(
-                r'<script[^>]+type=["\']application/json["\'][^>]*>({.+?})</script>',
-                webpage, 'application json'),
-            video_id)['activeMedia']['psId']
+        nrk_id = self._download_json(
+            'https://nrkno-skole-prod.kube.nrk.no/skole/api/media/%s' % video_id,
+            video_id)['psId']
 
         return self.url_result('nrk:%s' % nrk_id)
index 48fb9541693c35878317f22ed9dd6e2da4412ced..c39d12728d4fb2aa61494de77c9964d76d45d355 100644 (file)
@@ -541,6 +541,10 @@ def _real_extract(self, url):
                 'format_id': format_id,
                 'filesize': file_size,
             })
+            if format_id == '0p':
+                f['vcodec'] = 'none'
+            else:
+                f['fps'] = int_or_none(file_.get('fps'))
             formats.append(f)
         self._sort_formats(formats)
 
index 88b6859b01a7c51eebe9f129d759f68005c75ce6..ecf56ff8f69588b08fc578234f116b1372953cdf 100644 (file)
@@ -6,16 +6,33 @@
 from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
-    ExtractorError,
     dict_get,
+    ExtractorError,
     int_or_none,
-    unescapeHTML,
     parse_iso8601,
+    try_get,
+    unescapeHTML,
 )
 
 
 class PikselIE(InfoExtractor):
-    _VALID_URL = r'https?://player\.piksel\.com/v/(?:refid/[^/]+/prefid/)?(?P<id>[a-z0-9_]+)'
+    _VALID_URL = r'''(?x)https?://
+        (?:
+            (?:
+                player\.
+                    (?:
+                        olympusattelecom|
+                        vibebyvista
+                    )|
+                (?:api|player)\.multicastmedia|
+                (?:api-ovp|player)\.piksel
+            )\.com|
+            (?:
+                mz-edge\.stream\.co|
+                movie-s\.nhk\.or
+            )\.jp|
+            vidego\.baltimorecity\.gov
+        )/v/(?:refid/(?P<refid>[^/]+)/prefid/)?(?P<id>[\w-]+)'''
     _TESTS = [
         {
             'url': 'http://player.piksel.com/v/ums2867l',
@@ -56,46 +73,41 @@ def _extract_url(webpage):
         if mobj:
             return mobj.group('url')
 
+    def _call_api(self, app_token, resource, display_id, query, fatal=True):
+        response = (self._download_json(
+            'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token),
+            display_id, query=query, fatal=fatal) or {}).get('response')
+        failure = try_get(response, lambda x: x['failure']['reason'])
+        if failure:
+            if fatal:
+                raise ExtractorError(failure, expected=True)
+            self.report_warning(failure)
+        return response
+
     def _real_extract(self, url):
-        display_id = self._match_id(url)
+        ref_id, display_id = re.match(self._VALID_URL, url).groups()
         webpage = self._download_webpage(url, display_id)
-        video_id = self._search_regex(
-            r'data-de-program-uuid=[\'"]([a-z0-9]+)',
-            webpage, 'program uuid', default=display_id)
         app_token = self._search_regex([
             r'clientAPI\s*:\s*"([^"]+)"',
             r'data-de-api-key\s*=\s*"([^"]+)"'
         ], webpage, 'app token')
-        response = self._download_json(
-            'http://player.piksel.com/ws/ws_program/api/%s/mode/json/apiv/5' % app_token,
-            video_id, query={
-                'v': video_id
-            })['response']
-        failure = response.get('failure')
-        if failure:
-            raise ExtractorError(response['failure']['reason'], expected=True)
-        video_data = response['WsProgramResponse']['program']['asset']
+        query = {'refid': ref_id, 'prefid': display_id} if ref_id else {'v': display_id}
+        program = self._call_api(
+            app_token, 'program', display_id, query)['WsProgramResponse']['program']
+        video_id = program['uuid']
+        video_data = program['asset']
         title = video_data['title']
+        asset_type = dict_get(video_data, ['assetType', 'asset_type'])
 
         formats = []
 
-        m3u8_url = dict_get(video_data, [
-            'm3u8iPadURL',
-            'ipadM3u8Url',
-            'm3u8AndroidURL',
-            'm3u8iPhoneURL',
-            'iphoneM3u8Url'])
-        if m3u8_url:
-            formats.extend(self._extract_m3u8_formats(
-                m3u8_url, video_id, 'mp4', 'm3u8_native',
-                m3u8_id='hls', fatal=False))
-
-        asset_type = dict_get(video_data, ['assetType', 'asset_type'])
-        for asset_file in video_data.get('assetFiles', []):
+        def process_asset_file(asset_file):
+            if not asset_file:
+                return
             # TODO: extract rtmp formats
             http_url = asset_file.get('http_url')
             if not http_url:
-                continue
+                return
             tbr = None
             vbr = int_or_none(asset_file.get('videoBitrate'), 1024)
             abr = int_or_none(asset_file.get('audioBitrate'), 1024)
@@ -118,6 +130,43 @@ def _real_extract(self, url):
                 'filesize': int_or_none(asset_file.get('filesize')),
                 'tbr': tbr,
             })
+
+        def process_asset_files(asset_files):
+            for asset_file in (asset_files or []):
+                process_asset_file(asset_file)
+
+        process_asset_files(video_data.get('assetFiles'))
+        process_asset_file(video_data.get('referenceFile'))
+        if not formats:
+            asset_id = video_data.get('assetid') or program.get('assetid')
+            if asset_id:
+                process_asset_files(try_get(self._call_api(
+                    app_token, 'asset_file', display_id, {
+                        'assetid': asset_id,
+                    }, False), lambda x: x['WsAssetFileResponse']['AssetFiles']))
+
+        m3u8_url = dict_get(video_data, [
+            'm3u8iPadURL',
+            'ipadM3u8Url',
+            'm3u8AndroidURL',
+            'm3u8iPhoneURL',
+            'iphoneM3u8Url'])
+        if m3u8_url:
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', 'm3u8_native',
+                m3u8_id='hls', fatal=False))
+
+        smil_url = dict_get(video_data, ['httpSmil', 'hdSmil', 'rtmpSmil'])
+        if smil_url:
+            transform_source = None
+            if ref_id == 'nhkworld':
+                # TODO: figure out if this is something to be fixed in urljoin,
+                # _parse_smil_formats or keep it here
+                transform_source = lambda x: x.replace('src="/', 'src="').replace('/media"', '/media/"')
+            formats.extend(self._extract_smil_formats(
+                re.sub(r'/od/[^/]+/', '/od/http/', smil_url), video_id,
+                transform_source=transform_source, fatal=False))
+
         self._sort_formats(formats)
 
         subtitles = {}
index 529f3f7119fd4e93a8c82168d910ab4fc3d1720e..2fcbd186f84fb4a629c154f84339c6795bbfaf06 100644 (file)
@@ -31,7 +31,12 @@ def _download_webpage_handle(self, *args, **kwargs):
         def dl(*args, **kwargs):
             return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs)
 
-        webpage, urlh = dl(*args, **kwargs)
+        ret = dl(*args, **kwargs)
+
+        if not ret:
+            return ret
+
+        webpage, urlh = ret
 
         if any(re.search(p, webpage) for p in (
                 r'<body\b[^>]+\bonload=["\']go\(\)',
@@ -53,7 +58,7 @@ class PornHubIE(PornHubBaseIE):
     _VALID_URL = r'''(?x)
                     https?://
                         (?:
-                            (?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
+                            (?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
                             (?:www\.)?thumbzilla\.com/video/
                         )
                         (?P<id>[\da-z]+)
@@ -152,6 +157,9 @@ class PornHubIE(PornHubBaseIE):
     }, {
         'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933',
         'only_matching': True,
+    }, {
+        'url': 'https://www.pornhub.org/view_video.php?viewkey=203640933',
+        'only_matching': True,
     }, {
         'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82',
         'only_matching': True,
@@ -160,7 +168,7 @@ class PornHubIE(PornHubBaseIE):
     @staticmethod
     def _extract_urls(webpage):
         return re.findall(
-            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.(?:com|net)/embed/[\da-z]+)',
+            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.(?:com|net|org)/embed/[\da-z]+)',
             webpage)
 
     def _extract_count(self, pattern, webpage, name):
@@ -280,14 +288,24 @@ def add_video_url(video_url):
             video_urls.append((v_url, None))
             video_urls_set.add(v_url)
 
+        def parse_quality_items(quality_items):
+            q_items = self._parse_json(quality_items, video_id, fatal=False)
+            if not isinstance(q_items, list):
+                return
+            for item in q_items:
+                if isinstance(item, dict):
+                    add_video_url(item.get('url'))
+
         if not video_urls:
-            FORMAT_PREFIXES = ('media', 'quality')
+            FORMAT_PREFIXES = ('media', 'quality', 'qualityItems')
             js_vars = extract_js_vars(
                 webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES),
                 default=None)
             if js_vars:
                 for key, format_url in js_vars.items():
-                    if any(key.startswith(p) for p in FORMAT_PREFIXES):
+                    if key.startswith(FORMAT_PREFIXES[-1]):
+                        parse_quality_items(format_url)
+                    elif any(key.startswith(p) for p in FORMAT_PREFIXES[:2]):
                         add_video_url(format_url)
             if not video_urls and re.search(
                     r'<[^>]+\bid=["\']lockedPlayer', webpage):
@@ -343,12 +361,16 @@ def add_video_url(video_url):
             r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
             webpage, 'uploader', default=None)
 
+        def extract_vote_count(kind, name):
+            return self._extract_count(
+                (r'<span[^>]+\bclass="votes%s"[^>]*>([\d,\.]+)</span>' % kind,
+                 r'<span[^>]+\bclass=["\']votes%s["\'][^>]*\bdata-rating=["\'](\d+)' % kind),
+                webpage, name)
+
         view_count = self._extract_count(
             r'<span class="count">([\d,\.]+)</span> [Vv]iews', webpage, 'view')
-        like_count = self._extract_count(
-            r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
-        dislike_count = self._extract_count(
-            r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
+        like_count = extract_vote_count('Up', 'like')
+        dislike_count = extract_vote_count('Down', 'dislike')
         comment_count = self._extract_count(
             r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
 
@@ -422,7 +444,7 @@ def _real_extract(self, url):
 
 
 class PornHubUserIE(PornHubPlaylistBaseIE):
-    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
+    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
     _TESTS = [{
         'url': 'https://www.pornhub.com/model/zoe_ph',
         'playlist_mincount': 118,
@@ -490,7 +512,7 @@ def _real_extract(self, url):
 
 
 class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
-    _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?P<id>(?:[^/]+/)*[^/?#&]+)'
+    _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?P<id>(?:[^/]+/)*[^/?#&]+)'
     _TESTS = [{
         'url': 'https://www.pornhub.com/model/zoe_ph/videos',
         'only_matching': True,
@@ -605,7 +627,7 @@ def suitable(cls, url):
 
 
 class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
-    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)'
+    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)'
     _TESTS = [{
         'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
         'info_dict': {
index cd9125388872f6f54f8d79494de719dbe67e2ee9..77f66c9664d5cb2cc21a9abb80b067bd04e57839 100644 (file)
@@ -7,6 +7,8 @@
     ExtractorError,
     int_or_none,
     float_or_none,
+    try_get,
+    unescapeHTML,
     url_or_none,
 )
 
@@ -55,10 +57,12 @@ class RedditRIE(InfoExtractor):
             'id': 'zv89llsvexdz',
             'ext': 'mp4',
             'title': 'That small heart attack.',
-            'thumbnail': r're:^https?://.*\.jpg$',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+            'thumbnails': 'count:4',
             'timestamp': 1501941939,
             'upload_date': '20170805',
             'uploader': 'Antw87',
+            'duration': 12,
             'like_count': int,
             'dislike_count': int,
             'comment_count': int,
@@ -116,13 +120,40 @@ def _real_extract(self, url):
         else:
             age_limit = None
 
+        thumbnails = []
+
+        def add_thumbnail(src):
+            if not isinstance(src, dict):
+                return
+            thumbnail_url = url_or_none(src.get('url'))
+            if not thumbnail_url:
+                return
+            thumbnails.append({
+                'url': unescapeHTML(thumbnail_url),
+                'width': int_or_none(src.get('width')),
+                'height': int_or_none(src.get('height')),
+            })
+
+        for image in try_get(data, lambda x: x['preview']['images']) or []:
+            if not isinstance(image, dict):
+                continue
+            add_thumbnail(image.get('source'))
+            resolutions = image.get('resolutions')
+            if isinstance(resolutions, list):
+                for resolution in resolutions:
+                    add_thumbnail(resolution)
+
         return {
             '_type': 'url_transparent',
             'url': video_url,
             'title': data.get('title'),
-            'thumbnail': url_or_none(data.get('thumbnail')),
+            'thumbnails': thumbnails,
             'timestamp': float_or_none(data.get('created_utc')),
             'uploader': data.get('author'),
+            'duration': int_or_none(try_get(
+                data,
+                (lambda x: x['media']['reddit_video']['duration'],
+                 lambda x: x['secure_media']['reddit_video']['duration']))),
             'like_count': int_or_none(data.get('ups')),
             'dislike_count': int_or_none(data.get('downs')),
             'comment_count': int_or_none(data.get('num_comments')),
index f984040aa07b08f56b1028d0dde1dc3fc78137cb..c50cd3ecd8c20ab0d07c3da70091dc155269688c 100644 (file)
@@ -6,14 +6,24 @@
 from ..utils import (
     determine_ext,
     ExtractorError,
+    find_xpath_attr,
     int_or_none,
+    unified_strdate,
+    url_or_none,
     xpath_attr,
     xpath_text,
 )
 
 
 class RuutuIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla)/(?P<id>\d+)'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla|audio)/|
+                            static\.nelonenmedia\.fi/player/misc/embed_player\.html\?.*?\bnid=
+                        )
+                        (?P<id>\d+)
+                    '''
     _TESTS = [
         {
             'url': 'http://www.ruutu.fi/video/2058907',
@@ -71,15 +81,53 @@ class RuutuIE(InfoExtractor):
                 'thumbnail': r're:^https?://.*\.jpg$',
                 'age_limit': 0,
             },
-            'expected_warnings': ['HTTP Error 502: Bad Gateway'],
-        }
+            'expected_warnings': [
+                'HTTP Error 502: Bad Gateway',
+                'Failed to download m3u8 information',
+            ],
+        },
+        {
+            'url': 'http://www.supla.fi/audio/2231370',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://static.nelonenmedia.fi/player/misc/embed_player.html?nid=3618790',
+            'only_matching': True,
+        },
+        {
+            # episode
+            'url': 'https://www.ruutu.fi/video/3401964',
+            'info_dict': {
+                'id': '3401964',
+                'ext': 'mp4',
+                'title': 'Temptation Island Suomi - Kausi 5 - Jakso 17',
+                'description': 'md5:87cf01d5e1e88adf0c8a2937d2bd42ba',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 2582,
+                'age_limit': 12,
+                'upload_date': '20190508',
+                'series': 'Temptation Island Suomi',
+                'season_number': 5,
+                'episode_number': 17,
+                'categories': ['Reality ja tositapahtumat', 'Kotimaiset suosikit', 'Romantiikka ja parisuhde'],
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # premium
+            'url': 'https://www.ruutu.fi/video/3618715',
+            'only_matching': True,
+        },
     ]
+    _API_BASE = 'https://gatling.nelonenmedia.fi'
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
         video_xml = self._download_xml(
-            'https://gatling.nelonenmedia.fi/media-xml-cache', video_id,
+            '%s/media-xml-cache' % self._API_BASE, video_id,
             query={'id': video_id})
 
         formats = []
@@ -96,9 +144,18 @@ def extract_formats(node):
                         continue
                     processed_urls.append(video_url)
                     ext = determine_ext(video_url)
+                    auth_video_url = url_or_none(self._download_webpage(
+                        '%s/auth/access/v2' % self._API_BASE, video_id,
+                        note='Downloading authenticated %s stream URL' % ext,
+                        fatal=False, query={'stream': video_url}))
+                    if auth_video_url:
+                        processed_urls.append(auth_video_url)
+                        video_url = auth_video_url
                     if ext == 'm3u8':
                         formats.extend(self._extract_m3u8_formats(
-                            video_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+                            video_url, video_id, 'mp4',
+                            entry_protocol='m3u8_native', m3u8_id='hls',
+                            fatal=False))
                     elif ext == 'f4m':
                         formats.extend(self._extract_f4m_formats(
                             video_url, video_id, f4m_id='hds', fatal=False))
@@ -136,18 +193,35 @@ def extract_formats(node):
 
         extract_formats(video_xml.find('./Clip'))
 
-        drm = xpath_text(video_xml, './Clip/DRM', default=None)
-        if not formats and drm:
-            raise ExtractorError('This video is DRM protected.', expected=True)
+        def pv(name):
+            node = find_xpath_attr(
+                video_xml, './Clip/PassthroughVariables/variable', 'name', name)
+            if node is not None:
+                return node.get('value')
+
+        if not formats:
+            drm = xpath_text(video_xml, './Clip/DRM', default=None)
+            if drm:
+                raise ExtractorError('This video is DRM protected.', expected=True)
+            ns_st_cds = pv('ns_st_cds')
+            if ns_st_cds != 'free':
+                raise ExtractorError('This video is %s.' % ns_st_cds, expected=True)
 
         self._sort_formats(formats)
 
+        themes = pv('themes')
+
         return {
             'id': video_id,
             'title': xpath_attr(video_xml, './/Behavior/Program', 'program_name', 'title', fatal=True),
             'description': xpath_attr(video_xml, './/Behavior/Program', 'description', 'description'),
             'thumbnail': xpath_attr(video_xml, './/Behavior/Startpicture', 'href', 'thumbnail'),
-            'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')),
+            'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')) or int_or_none(pv('runtime')),
             'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')),
+            'upload_date': unified_strdate(pv('date_start')),
+            'series': pv('series_name'),
+            'season_number': int_or_none(pv('season_number')),
+            'episode_number': int_or_none(pv('episode_number')),
+            'categories': themes.split(',') if themes else [],
             'formats': formats,
         }
index 84568ac69f4bc761faa20fb3039d3070ae109ba5..240afc18f62872d43bf26e07cbca97f9249ea820 100644 (file)
@@ -4,8 +4,12 @@
 import re
 
 from .brightcove import BrightcoveNewIE
-from ..compat import compat_str
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+)
 from ..utils import (
+    ExtractorError,
     try_get,
     update_url_query,
 )
@@ -41,16 +45,22 @@ class SevenPlusIE(BrightcoveNewIE):
     def _real_extract(self, url):
         path, episode_id = re.match(self._VALID_URL, url).groups()
 
-        media = self._download_json(
-            'https://videoservice.swm.digital/playback', episode_id, query={
-                'appId': '7plus',
-                'deviceType': 'web',
-                'platformType': 'web',
-                'accountId': 5303576322001,
-                'referenceId': 'ref:' + episode_id,
-                'deliveryId': 'csai',
-                'videoType': 'vod',
-            })['media']
+        try:
+            media = self._download_json(
+                'https://videoservice.swm.digital/playback', episode_id, query={
+                    'appId': '7plus',
+                    'deviceType': 'web',
+                    'platformType': 'web',
+                    'accountId': 5303576322001,
+                    'referenceId': 'ref:' + episode_id,
+                    'deliveryId': 'csai',
+                    'videoType': 'vod',
+                })['media']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                raise ExtractorError(self._parse_json(
+                    e.cause.read().decode(), episode_id)[0]['error_code'], expected=True)
+            raise
 
         for source in media.get('sources', {}):
             src = source.get('src')
index ea30d6e62e1094b112a0c32c6c67e75cbb927406..ff2c977a02bff10543d96da3694e57e1d9d33d0a 100644 (file)
@@ -1,6 +1,8 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..utils import (
     extract_attributes,
 
 
 class SkyBaseIE(InfoExtractor):
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        video_data = extract_attributes(self._search_regex(
-            r'(<div.+?class="[^"]*sdc-article-video__media-ooyala[^"]*"[^>]+>)',
-            webpage, 'video data'))
-
-        video_url = 'ooyala:%s' % video_data['data-video-id']
-        if video_data.get('data-token-required') == 'true':
-            token_fetch_options = self._parse_json(video_data.get(
-                'data-token-fetch-options', '{}'), video_id, fatal=False) or {}
-            token_fetch_url = token_fetch_options.get('url')
-            if token_fetch_url:
-                embed_token = self._download_webpage(urljoin(
-                    url, token_fetch_url), video_id, fatal=False)
-                if embed_token:
-                    video_url = smuggle_url(
-                        video_url, {'embed_token': embed_token.strip('"')})
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
+    _SDC_EL_REGEX = r'(?s)(<div[^>]+data-(?:component-name|fn)="sdc-(?:articl|sit)e-video"[^>]*>)'
+
+    def _process_ooyala_element(self, webpage, sdc_el, url):
+        sdc = extract_attributes(sdc_el)
+        provider = sdc.get('data-provider')
+        if provider == 'ooyala':
+            video_id = sdc['data-sdc-video-id']
+            video_url = 'ooyala:%s' % video_id
+            ie_key = 'Ooyala'
+            ooyala_el = self._search_regex(
+                r'(<div[^>]+class="[^"]*\bsdc-article-video__media-ooyala\b[^"]*"[^>]+data-video-id="%s"[^>]*>)' % video_id,
+                webpage, 'video data', fatal=False)
+            if ooyala_el:
+                ooyala_attrs = extract_attributes(ooyala_el) or {}
+                if ooyala_attrs.get('data-token-required') == 'true':
+                    token_fetch_url = (self._parse_json(ooyala_attrs.get(
+                        'data-token-fetch-options', '{}'),
+                        video_id, fatal=False) or {}).get('url')
+                    if token_fetch_url:
+                        embed_token = self._download_json(urljoin(
+                            url, token_fetch_url), video_id, fatal=False)
+                        if embed_token:
+                            video_url = smuggle_url(
+                                video_url, {'embed_token': embed_token})
+        elif provider == 'brightcove':
+            video_id = sdc['data-video-id']
+            account_id = sdc.get('data-account-id') or '6058004172001'
+            player_id = sdc.get('data-player-id') or 'RC9PQUaJ6'
+            video_url = self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id)
+            ie_key = 'BrightcoveNew'
 
         return {
             '_type': 'url_transparent',
             'id': video_id,
             'url': video_url,
+            'ie_key': ie_key,
+        }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        info = self._process_ooyala_element(webpage, self._search_regex(
+            self._SDC_EL_REGEX, webpage, 'sdc element'), url)
+        info.update({
             'title': self._og_search_title(webpage),
             'description': strip_or_none(self._og_search_description(webpage)),
-            'ie_key': 'Ooyala',
-        }
+        })
+        return info
 
 
 class SkySportsIE(SkyBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/(?P<id>[0-9]+)'
-    _TEST = {
+    IE_NAME = 'sky:sports'
+    _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/([^/]+/)*(?P<id>[0-9]+)'
+    _TESTS = [{
         'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine',
         'md5': '77d59166cddc8d3cb7b13e35eaf0f5ec',
         'info_dict': {
@@ -52,19 +77,55 @@ class SkySportsIE(SkyBaseIE):
             'description': 'md5:e88bda94ae15f7720c5cb467e777bb6d',
         },
         'add_ie': ['Ooyala'],
-    }
+    }, {
+        'url': 'https://www.skysports.com/watch/video/sports/f1/12160544/abu-dhabi-gp-the-notebook',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.skysports.com/watch/video/tv-shows/12118508/rainford-brent-how-ace-programme-helps',
+        'only_matching': True,
+    }]
 
 
 class SkyNewsIE(SkyBaseIE):
+    IE_NAME = 'sky:news'
     _VALID_URL = r'https?://news\.sky\.com/video/[0-9a-z-]+-(?P<id>[0-9]+)'
     _TEST = {
         'url': 'https://news.sky.com/video/russian-plane-inspected-after-deadly-fire-11712962',
-        'md5': 'd6327e581473cea9976a3236ded370cd',
+        'md5': '411e8893fd216c75eaf7e4c65d364115',
         'info_dict': {
-            'id': '1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM',
+            'id': 'ref:1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM',
             'ext': 'mp4',
             'title': 'Russian plane inspected after deadly fire',
             'description': 'The Russian Investigative Committee has released video of the wreckage of a passenger plane which caught fire near Moscow.',
+            'uploader_id': '6058004172001',
+            'timestamp': 1567112345,
+            'upload_date': '20190829',
         },
-        'add_ie': ['Ooyala'],
+        'add_ie': ['BrightcoveNew'],
+    }
+
+
+class SkySportsNewsIE(SkyBaseIE):
+    IE_NAME = 'sky:sports:news'
+    _VALID_URL = r'https?://(?:www\.)?skysports\.com/([^/]+/)*news/\d+/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.skysports.com/golf/news/12176/10871916/dustin-johnson-ready-to-conquer-players-championship-at-tpc-sawgrass',
+        'info_dict': {
+            'id': '10871916',
+            'title': 'Dustin Johnson ready to conquer Players Championship at TPC Sawgrass',
+            'description': 'Dustin Johnson is confident he can continue his dominant form in 2017 by adding the Players Championship to his list of victories.',
+        },
+        'playlist_count': 2,
     }
+
+    def _real_extract(self, url):
+        article_id = self._match_id(url)
+        webpage = self._download_webpage(url, article_id)
+
+        entries = []
+        for sdc_el in re.findall(self._SDC_EL_REGEX, webpage):
+            entries.append(self._process_ooyala_element(webpage, sdc_el, url))
+
+        return self.playlist_result(
+            entries, article_id, self._og_search_title(webpage),
+            self._html_search_meta(['og:description', 'description'], webpage))
index d9ea76831e7ebdc37bad0e722cd92268bfd8eb6f..9409a010090029bc9df30a667cdaa63e08e1b835 100644 (file)
@@ -2,7 +2,12 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import smuggle_url
+from ..utils import (
+    bool_or_none,
+    smuggle_url,
+    try_get,
+    url_or_none,
+)
 
 
 class SlidesLiveIE(InfoExtractor):
@@ -18,8 +23,21 @@ class SlidesLiveIE(InfoExtractor):
             'description': 'Watch full version of this video at https://slideslive.com/38902413.',
             'uploader': 'SlidesLive Videos - A',
             'uploader_id': 'UC62SdArr41t_-_fX40QCLRw',
+            'timestamp': 1597615266,
             'upload_date': '20170925',
         }
+    }, {
+        # video_service_name = yoda
+        'url': 'https://slideslive.com/38935785',
+        'md5': '575cd7a6c0acc6e28422fe76dd4bcb1a',
+        'info_dict': {
+            'id': 'RMraDYN5ozA_',
+            'ext': 'mp4',
+            'title': 'Offline Reinforcement Learning: From Algorithms to Practical Challenges',
+        },
+        'params': {
+            'format': 'bestvideo',
+        },
     }, {
         # video_service_name = youtube
         'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend',
@@ -39,18 +57,48 @@ def _real_extract(self, url):
         video_data = self._download_json(
             'https://ben.slideslive.com/player/' + video_id, video_id)
         service_name = video_data['video_service_name'].lower()
-        assert service_name in ('url', 'vimeo', 'youtube')
+        assert service_name in ('url', 'yoda', 'vimeo', 'youtube')
         service_id = video_data['video_service_id']
+        subtitles = {}
+        for sub in try_get(video_data, lambda x: x['subtitles'], list) or []:
+            if not isinstance(sub, dict):
+                continue
+            webvtt_url = url_or_none(sub.get('webvtt_url'))
+            if not webvtt_url:
+                continue
+            lang = sub.get('language') or 'en'
+            subtitles.setdefault(lang, []).append({
+                'url': webvtt_url,
+            })
         info = {
             'id': video_id,
             'thumbnail': video_data.get('thumbnail'),
-            'url': service_id,
+            'is_live': bool_or_none(video_data.get('is_live')),
+            'subtitles': subtitles,
         }
-        if service_name == 'url':
+        if service_name in ('url', 'yoda'):
             info['title'] = video_data['title']
+            if service_name == 'url':
+                info['url'] = service_id
+            else:
+                formats = []
+                _MANIFEST_PATTERN = 'https://01.cdn.yoda.slideslive.com/%s/master.%s'
+                # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol
+                formats.extend(self._extract_m3u8_formats(
+                    _MANIFEST_PATTERN % (service_id, 'm3u8'),
+                    service_id, 'mp4', m3u8_id='hls', fatal=False))
+                formats.extend(self._extract_mpd_formats(
+                    _MANIFEST_PATTERN % (service_id, 'mpd'), service_id,
+                    mpd_id='dash', fatal=False))
+                self._sort_formats(formats)
+                info.update({
+                    'id': service_id,
+                    'formats': formats,
+                })
         else:
             info.update({
                 '_type': 'url_transparent',
+                'url': service_id,
                 'ie_key': service_name.capitalize(),
                 'title': video_data.get('title'),
             })
diff --git a/youtube_dlc/extractor/smotri.py b/youtube_dlc/extractor/smotri.py
deleted file mode 100644 (file)
index 45995f3..0000000
+++ /dev/null
@@ -1,416 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-import json
-import hashlib
-import uuid
-
-from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    int_or_none,
-    sanitized_Request,
-    unified_strdate,
-    urlencode_postdata,
-    xpath_text,
-)
-
-
-class SmotriIE(InfoExtractor):
-    IE_DESC = 'Smotri.com'
-    IE_NAME = 'smotri'
-    _VALID_URL = r'https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<id>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})'
-    _NETRC_MACHINE = 'smotri'
-
-    _TESTS = [
-        # real video id 2610366
-        {
-            'url': 'http://smotri.com/video/view/?id=v261036632ab',
-            'md5': '02c0dfab2102984e9c5bb585cc7cc321',
-            'info_dict': {
-                'id': 'v261036632ab',
-                'ext': 'mp4',
-                'title': 'катастрофа с камер видеонаблюдения',
-                'uploader': 'rbc2008',
-                'uploader_id': 'rbc08',
-                'upload_date': '20131118',
-                'thumbnail': 'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg',
-            },
-        },
-        # real video id 57591
-        {
-            'url': 'http://smotri.com/video/view/?id=v57591cb20',
-            'md5': '830266dfc21f077eac5afd1883091bcd',
-            'info_dict': {
-                'id': 'v57591cb20',
-                'ext': 'flv',
-                'title': 'test',
-                'uploader': 'Support Photofile@photofile',
-                'uploader_id': 'support-photofile',
-                'upload_date': '20070704',
-                'thumbnail': 'http://frame4.loadup.ru/03/ed/57591.2.3.jpg',
-            },
-        },
-        # video-password, not approved by moderator
-        {
-            'url': 'http://smotri.com/video/view/?id=v1390466a13c',
-            'md5': 'f6331cef33cad65a0815ee482a54440b',
-            'info_dict': {
-                'id': 'v1390466a13c',
-                'ext': 'mp4',
-                'title': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',
-                'uploader': 'timoxa40',
-                'uploader_id': 'timoxa40',
-                'upload_date': '20100404',
-                'thumbnail': 'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg',
-            },
-            'params': {
-                'videopassword': 'qwerty',
-            },
-            'skip': 'Video is not approved by moderator',
-        },
-        # video-password
-        {
-            'url': 'http://smotri.com/video/view/?id=v6984858774#',
-            'md5': 'f11e01d13ac676370fc3b95b9bda11b0',
-            'info_dict': {
-                'id': 'v6984858774',
-                'ext': 'mp4',
-                'title': 'Дача Солженицина ПАРОЛЬ 223322',
-                'uploader': 'psavari1',
-                'uploader_id': 'psavari1',
-                'upload_date': '20081103',
-                'thumbnail': r're:^https?://.*\.jpg$',
-            },
-            'params': {
-                'videopassword': '223322',
-            },
-        },
-        # age limit + video-password, not approved by moderator
-        {
-            'url': 'http://smotri.com/video/view/?id=v15408898bcf',
-            'md5': '91e909c9f0521adf5ee86fbe073aad70',
-            'info_dict': {
-                'id': 'v15408898bcf',
-                'ext': 'flv',
-                'title': 'этот ролик не покажут по ТВ',
-                'uploader': 'zzxxx',
-                'uploader_id': 'ueggb',
-                'upload_date': '20101001',
-                'thumbnail': 'http://frame3.loadup.ru/75/75/1540889.1.3.jpg',
-                'age_limit': 18,
-            },
-            'params': {
-                'videopassword': '333'
-            },
-            'skip': 'Video is not approved by moderator',
-        },
-        # age limit + video-password
-        {
-            'url': 'http://smotri.com/video/view/?id=v7780025814',
-            'md5': 'b4599b068422559374a59300c5337d72',
-            'info_dict': {
-                'id': 'v7780025814',
-                'ext': 'mp4',
-                'title': 'Sexy Beach (пароль 123)',
-                'uploader': 'вАся',
-                'uploader_id': 'asya_prosto',
-                'upload_date': '20081218',
-                'thumbnail': r're:^https?://.*\.jpg$',
-                'age_limit': 18,
-            },
-            'params': {
-                'videopassword': '123'
-            },
-        },
-        # swf player
-        {
-            'url': 'http://pics.smotri.com/scrubber_custom8.swf?file=v9188090500',
-            'md5': '31099eeb4bc906712c5f40092045108d',
-            'info_dict': {
-                'id': 'v9188090500',
-                'ext': 'mp4',
-                'title': 'Shakira - Don\'t Bother',
-                'uploader': 'HannahL',
-                'uploader_id': 'lisaha95',
-                'upload_date': '20090331',
-                'thumbnail': 'http://frame8.loadup.ru/44/0b/918809.7.3.jpg',
-            },
-        },
-    ]
-
-    @classmethod
-    def _extract_url(cls, webpage):
-        mobj = re.search(
-            r'<embed[^>]src=(["\'])(?P<url>http://pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=v.+?\1)',
-            webpage)
-        if mobj is not None:
-            return mobj.group('url')
-
-        mobj = re.search(
-            r'''(?x)<div\s+class="video_file">http://smotri\.com/video/download/file/[^<]+</div>\s*
-                    <div\s+class="video_image">[^<]+</div>\s*
-                    <div\s+class="video_id">(?P<id>[^<]+)</div>''', webpage)
-        if mobj is not None:
-            return 'http://smotri.com/video/view/?id=%s' % mobj.group('id')
-
-    def _search_meta(self, name, html, display_name=None):
-        if display_name is None:
-            display_name = name
-        return self._html_search_meta(name, html, display_name)
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        video_form = {
-            'ticket': video_id,
-            'video_url': '1',
-            'frame_url': '1',
-            'devid': 'LoadupFlashPlayer',
-            'getvideoinfo': '1',
-        }
-
-        video_password = self._downloader.params.get('videopassword')
-        if video_password:
-            video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest()
-
-        video = self._download_json(
-            'http://smotri.com/video/view/url/bot/',
-            video_id, 'Downloading video JSON',
-            data=urlencode_postdata(video_form),
-            headers={'Content-Type': 'application/x-www-form-urlencoded'})
-
-        video_url = video.get('_vidURL') or video.get('_vidURL_mp4')
-
-        if not video_url:
-            if video.get('_moderate_no'):
-                raise ExtractorError(
-                    'Video %s has not been approved by moderator' % video_id, expected=True)
-
-            if video.get('error'):
-                raise ExtractorError('Video %s does not exist' % video_id, expected=True)
-
-            if video.get('_pass_protected') == 1:
-                msg = ('Invalid video password' if video_password
-                       else 'This video is protected by a password, use the --video-password option')
-                raise ExtractorError(msg, expected=True)
-
-        title = video['title']
-        thumbnail = video.get('_imgURL')
-        upload_date = unified_strdate(video.get('added'))
-        uploader = video.get('userNick')
-        uploader_id = video.get('userLogin')
-        duration = int_or_none(video.get('duration'))
-
-        # Video JSON does not provide enough meta data
-        # We will extract some from the video web page instead
-        webpage_url = 'http://smotri.com/video/view/?id=%s' % video_id
-        webpage = self._download_webpage(webpage_url, video_id, 'Downloading video page')
-
-        # Warning if video is unavailable
-        warning = self._html_search_regex(
-            r'<div[^>]+class="videoUnModer"[^>]*>(.+?)</div>', webpage,
-            'warning message', default=None)
-        if warning is not None:
-            self._downloader.report_warning(
-                'Video %s may not be available; smotri said: %s ' %
-                (video_id, warning))
-
-        # Adult content
-        if 'EroConfirmText">' in webpage:
-            self.report_age_confirmation()
-            confirm_string = self._html_search_regex(
-                r'<a[^>]+href="/video/view/\?id=%s&confirm=([^"]+)"' % video_id,
-                webpage, 'confirm string')
-            confirm_url = webpage_url + '&confirm=%s' % confirm_string
-            webpage = self._download_webpage(
-                confirm_url, video_id,
-                'Downloading video page (age confirmed)')
-            adult_content = True
-        else:
-            adult_content = False
-
-        view_count = self._html_search_regex(
-            r'(?s)Общее количество просмотров.*?<span class="Number">(\d+)</span>',
-            webpage, 'view count', fatal=False)
-
-        return {
-            'id': video_id,
-            'url': video_url,
-            'title': title,
-            'thumbnail': thumbnail,
-            'uploader': uploader,
-            'upload_date': upload_date,
-            'uploader_id': uploader_id,
-            'duration': duration,
-            'view_count': int_or_none(view_count),
-            'age_limit': 18 if adult_content else 0,
-        }
-
-
-class SmotriCommunityIE(InfoExtractor):
-    IE_DESC = 'Smotri.com community videos'
-    IE_NAME = 'smotri:community'
-    _VALID_URL = r'https?://(?:www\.)?smotri\.com/community/video/(?P<id>[0-9A-Za-z_\'-]+)'
-    _TEST = {
-        'url': 'http://smotri.com/community/video/kommuna',
-        'info_dict': {
-            'id': 'kommuna',
-        },
-        'playlist_mincount': 4,
-    }
-
-    def _real_extract(self, url):
-        community_id = self._match_id(url)
-
-        rss = self._download_xml(
-            'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id,
-            community_id, 'Downloading community RSS')
-
-        entries = [
-            self.url_result(video_url.text, SmotriIE.ie_key())
-            for video_url in rss.findall('./channel/item/link')]
-
-        return self.playlist_result(entries, community_id)
-
-
-class SmotriUserIE(InfoExtractor):
-    IE_DESC = 'Smotri.com user videos'
-    IE_NAME = 'smotri:user'
-    _VALID_URL = r'https?://(?:www\.)?smotri\.com/user/(?P<id>[0-9A-Za-z_\'-]+)'
-    _TESTS = [{
-        'url': 'http://smotri.com/user/inspector',
-        'info_dict': {
-            'id': 'inspector',
-            'title': 'Inspector',
-        },
-        'playlist_mincount': 9,
-    }]
-
-    def _real_extract(self, url):
-        user_id = self._match_id(url)
-
-        rss = self._download_xml(
-            'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id,
-            user_id, 'Downloading user RSS')
-
-        entries = [self.url_result(video_url.text, 'Smotri')
-                   for video_url in rss.findall('./channel/item/link')]
-
-        description_text = xpath_text(rss, './channel/description') or ''
-        user_nickname = self._search_regex(
-            '^Видео режиссера (.+)$', description_text,
-            'user nickname', fatal=False)
-
-        return self.playlist_result(entries, user_id, user_nickname)
-
-
-class SmotriBroadcastIE(InfoExtractor):
-    IE_DESC = 'Smotri.com broadcasts'
-    IE_NAME = 'smotri:broadcast'
-    _VALID_URL = r'https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<id>[^/]+))/?.*'
-    _NETRC_MACHINE = 'smotri'
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        broadcast_id = mobj.group('id')
-
-        broadcast_url = 'http://' + mobj.group('url')
-        broadcast_page = self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page')
-
-        if re.search('>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % broadcast_id, broadcast_page) is not None:
-            raise ExtractorError(
-                'Broadcast %s does not exist' % broadcast_id, expected=True)
-
-        # Adult content
-        if re.search('EroConfirmText">', broadcast_page) is not None:
-
-            (username, password) = self._get_login_info()
-            if username is None:
-                self.raise_login_required(
-                    'Erotic broadcasts allowed only for registered users')
-
-            login_form = {
-                'login-hint53': '1',
-                'confirm_erotic': '1',
-                'login': username,
-                'password': password,
-            }
-
-            request = sanitized_Request(
-                broadcast_url + '/?no_redirect=1', urlencode_postdata(login_form))
-            request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-            broadcast_page = self._download_webpage(
-                request, broadcast_id, 'Logging in and confirming age')
-
-            if '>Неверный логин или пароль<' in broadcast_page:
-                raise ExtractorError(
-                    'Unable to log in: bad username or password', expected=True)
-
-            adult_content = True
-        else:
-            adult_content = False
-
-        ticket = self._html_search_regex(
-            (r'data-user-file=(["\'])(?P<ticket>(?!\1).+)\1',
-             r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'(?P<ticket>[^']+)'\)"),
-            broadcast_page, 'broadcast ticket', group='ticket')
-
-        broadcast_url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket
-
-        broadcast_password = self._downloader.params.get('videopassword')
-        if broadcast_password:
-            broadcast_url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest()
-
-        broadcast_json_page = self._download_webpage(
-            broadcast_url, broadcast_id, 'Downloading broadcast JSON')
-
-        try:
-            broadcast_json = json.loads(broadcast_json_page)
-
-            protected_broadcast = broadcast_json['_pass_protected'] == 1
-            if protected_broadcast and not broadcast_password:
-                raise ExtractorError(
-                    'This broadcast is protected by a password, use the --video-password option',
-                    expected=True)
-
-            broadcast_offline = broadcast_json['is_play'] == 0
-            if broadcast_offline:
-                raise ExtractorError('Broadcast %s is offline' % broadcast_id, expected=True)
-
-            rtmp_url = broadcast_json['_server']
-            mobj = re.search(r'^rtmp://[^/]+/(?P<app>.+)/?$', rtmp_url)
-            if not mobj:
-                raise ExtractorError('Unexpected broadcast rtmp URL')
-
-            broadcast_playpath = broadcast_json['_streamName']
-            broadcast_app = '%s/%s' % (mobj.group('app'), broadcast_json['_vidURL'])
-            broadcast_thumbnail = broadcast_json.get('_imgURL')
-            broadcast_title = self._live_title(broadcast_json['title'])
-            broadcast_description = broadcast_json.get('description')
-            broadcaster_nick = broadcast_json.get('nick')
-            broadcaster_login = broadcast_json.get('login')
-            rtmp_conn = 'S:%s' % uuid.uuid4().hex
-        except KeyError:
-            if protected_broadcast:
-                raise ExtractorError('Bad broadcast password', expected=True)
-            raise ExtractorError('Unexpected broadcast JSON')
-
-        return {
-            'id': broadcast_id,
-            'url': rtmp_url,
-            'title': broadcast_title,
-            'thumbnail': broadcast_thumbnail,
-            'description': broadcast_description,
-            'uploader': broadcaster_nick,
-            'uploader_id': broadcaster_login,
-            'age_limit': 18 if adult_content else 0,
-            'ext': 'flv',
-            'play_path': broadcast_playpath,
-            'player_url': 'http://pics.smotri.com/broadcast_play.swf',
-            'app': broadcast_app,
-            'rtmp_live': True,
-            'rtmp_conn': rtmp_conn,
-            'is_live': True,
-        }
index 58a8c0d4ddb2f282241af2afad37e9d4b8403085..fedfceb6289c5b1979920d2038c1b79171d63b2b 100644 (file)
 # coding: utf-8
 from __future__ import unicode_literals
 
+import time
+import uuid
+
 from .common import InfoExtractor
-from ..utils import smuggle_url
+from ..compat import compat_HTTPError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+)
 
 
 class SonyLIVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/details/[^/]+/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/(?:s(?:how|port)s/[^/]+|movies|clip|trailer|music-videos)/[^/?#&]+-(?P<id>\d+)'
     _TESTS = [{
-        'url': "http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor's-Delight",
+        'url': 'https://www.sonyliv.com/shows/bachelors-delight-1700000113/achaari-cheese-toast-1000022678?watch=true',
         'info_dict': {
-            'title': "Ep. 1 - Achaari Cheese Toast - Bachelor's Delight",
-            'id': 'ref:5024612095001',
+            'title': 'Bachelors Delight - Achaari Cheese Toast',
+            'id': '1000022678',
             'ext': 'mp4',
-            'upload_date': '20170923',
-            'description': 'md5:7f28509a148d5be9d0782b4d5106410d',
-            'uploader_id': '5182475815001',
-            'timestamp': 1506200547,
+            'upload_date': '20200411',
+            'description': 'md5:3957fa31d9309bf336ceb3f37ad5b7cb',
+            'timestamp': 1586632091,
+            'duration': 185,
+            'season_number': 1,
+            'episode': 'Achaari Cheese Toast',
+            'episode_number': 1,
+            'release_year': 2016,
         },
         'params': {
             'skip_download': True,
         },
-        'add_ie': ['BrightcoveNew'],
     }, {
-        'url': 'http://www.sonyliv.com/details/full%20movie/4951168986001/Sei-Raat-(Bangla)',
+        'url': 'https://www.sonyliv.com/movies/tahalka-1000050121?watch=true',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.sonyliv.com/clip/jigarbaaz-1000098925',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.sonyliv.com/trailer/sandwiched-forever-1000100286?watch=true',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.sonyliv.com/sports/india-tour-of-australia-2020-21-1700000286/cricket-hls-day-3-1st-test-aus-vs-ind-19-dec-2020-1000100959?watch=true',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.sonyliv.com/music-videos/yeh-un-dinon-ki-baat-hai-1000018779',
         'only_matching': True,
     }]
+    _GEO_COUNTRIES = ['IN']
+    _TOKEN = None
 
-    # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s'
-    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5182475815001/default_default/index.html?videoId=ref:%s'
+    def _call_api(self, version, path, video_id):
+        headers = {}
+        if self._TOKEN:
+            headers['security_token'] = self._TOKEN
+        try:
+            return self._download_json(
+                'https://apiv2.sonyliv.com/AGL/%s/A/ENG/WEB/%s' % (version, path),
+                video_id, headers=headers)['resultObj']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                message = self._parse_json(
+                    e.cause.read().decode(), video_id)['message']
+                if message == 'Geoblocked Country':
+                    self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+                raise ExtractorError(message)
+            raise
+
+    def _real_initialize(self):
+        self._TOKEN = self._call_api('1.4', 'ALL/GETTOKEN', None)
 
     def _real_extract(self, url):
-        brightcove_id = self._match_id(url)
-        return self.url_result(
-            smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, {
-                'geo_countries': ['IN'],
-                'referrer': url,
-            }),
-            'BrightcoveNew', brightcove_id)
+        video_id = self._match_id(url)
+        content = self._call_api(
+            '1.5', 'IN/CONTENT/VIDEOURL/VOD/' + video_id, video_id)
+        if content.get('isEncrypted'):
+            raise ExtractorError('This video is DRM protected.', expected=True)
+        dash_url = content['videoURL']
+        headers = {
+            'x-playback-session-id': '%s-%d' % (uuid.uuid4().hex, time.time() * 1000)
+        }
+        formats = self._extract_mpd_formats(
+            dash_url, video_id, mpd_id='dash', headers=headers, fatal=False)
+        formats.extend(self._extract_m3u8_formats(
+            dash_url.replace('.mpd', '.m3u8').replace('/DASH/', '/HLS/'),
+            video_id, 'mp4', m3u8_id='hls', headers=headers, fatal=False))
+        for f in formats:
+            f.setdefault('http_headers', {}).update(headers)
+        self._sort_formats(formats)
+
+        metadata = self._call_api(
+            '1.6', 'IN/DETAIL/' + video_id, video_id)['containers'][0]['metadata']
+        title = metadata['title']
+        episode = metadata.get('episodeTitle')
+        if episode and title != episode:
+            title += ' - ' + episode
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': content.get('posterURL'),
+            'description': metadata.get('longDescription') or metadata.get('shortDescription'),
+            'timestamp': int_or_none(metadata.get('creationDate'), 1000),
+            'duration': int_or_none(metadata.get('duration')),
+            'season_number': int_or_none(metadata.get('season')),
+            'episode': episode,
+            'episode_number': int_or_none(metadata.get('episodeNumber')),
+            'release_year': int_or_none(metadata.get('year')),
+        }
index 61ca902ce286e6274c7d5776bd10c265a023643f..37cb8c839e5e1f37f80052fe9b6a306af663a9b0 100644 (file)
@@ -7,17 +7,24 @@
     determine_ext,
     ExtractorError,
     merge_dicts,
-    orderedSet,
     parse_duration,
     parse_resolution,
     str_to_int,
     url_or_none,
     urlencode_postdata,
+    urljoin,
 )
 
 
 class SpankBangIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/(?:video|play|embed)\b'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:[^/]+\.)?spankbang\.com/
+                        (?:
+                            (?P<id>[\da-z]+)/(?:video|play|embed)\b|
+                            [\da-z]+-(?P<id_2>[\da-z]+)/playlist/[^/?#&]+
+                        )
+                    '''
     _TESTS = [{
         'url': 'http://spankbang.com/3vvn/video/fantasy+solo',
         'md5': '1cc433e1d6aa14bc376535b8679302f7',
@@ -57,10 +64,14 @@ class SpankBangIE(InfoExtractor):
     }, {
         'url': 'https://spankbang.com/2y3td/embed/',
         'only_matching': True,
+    }, {
+        'url': 'https://spankbang.com/2v7ik-7ecbgu/playlist/latina+booty',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id') or mobj.group('id_2')
         webpage = self._download_webpage(
             url.replace('/%s/embed' % video_id, '/%s/video' % video_id),
             video_id, headers={'Cookie': 'country=US'})
@@ -155,30 +166,33 @@ def extract_format(format_id, format_url):
 
 
 class SpankBangPlaylistIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/playlist/[^/]+'
+    _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/playlist/(?P<display_id>[^/]+)'
     _TEST = {
         'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties',
         'info_dict': {
             'id': 'ug0k',
             'title': 'Big Ass Titties',
         },
-        'playlist_mincount': 50,
+        'playlist_mincount': 40,
     }
 
     def _real_extract(self, url):
-        playlist_id = self._match_id(url)
+        mobj = re.match(self._VALID_URL, url)
+        playlist_id = mobj.group('id')
+        display_id = mobj.group('display_id')
 
         webpage = self._download_webpage(
             url, playlist_id, headers={'Cookie': 'country=US; mobile=on'})
 
         entries = [self.url_result(
-            'https://spankbang.com/%s/video' % video_id,
-            ie=SpankBangIE.ie_key(), video_id=video_id)
-            for video_id in orderedSet(re.findall(
-                r'<a[^>]+\bhref=["\']/?([\da-z]+)/play/', webpage))]
+            urljoin(url, mobj.group('path')),
+            ie=SpankBangIE.ie_key(), video_id=mobj.group('id'))
+            for mobj in re.finditer(
+                r'<a[^>]+\bhref=(["\'])(?P<path>/?[\da-z]+-(?P<id>[\da-z]+)/playlist/%s(?:(?!\1).)*)\1'
+                % re.escape(display_id), webpage)]
 
         title = self._html_search_regex(
-            r'<h1>([^<]+)\s+playlist</h1>', webpage, 'playlist title',
+            r'<h1>([^<]+)\s+playlist\s*<', webpage, 'playlist title',
             fatal=False)
 
         return self.playlist_result(entries, playlist_id, title)
index 8467bf49df5fd7ab669c61b2aa701fd55a7b318c..e243732f2442941e5d7cbc249bb2a7a9e5cd483b 100644 (file)
@@ -3,50 +3,62 @@
 
 from .adobepass import AdobePassIE
 from ..utils import (
-    extract_attributes,
-    update_url_query,
+    int_or_none,
     smuggle_url,
+    update_url_query,
 )
 
 
 class SproutIE(AdobePassIE):
-    _VALID_URL = r'https?://(?:www\.)?sproutonline\.com/watch/(?P<id>[^/?#]+)'
-    _TEST = {
-        'url': 'http://www.sproutonline.com/watch/cowboy-adventure',
-        'md5': '74bf14128578d1e040c3ebc82088f45f',
+    _VALID_URL = r'https?://(?:www\.)?(?:sproutonline|universalkids)\.com/(?:watch|(?:[^/]+/)*videos)/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'https://www.universalkids.com/shows/remy-and-boo/season/1/videos/robot-bike-race',
         'info_dict': {
-            'id': '9dexnwtmh8_X',
+            'id': 'bm0foJFaTKqb',
             'ext': 'mp4',
-            'title': 'A Cowboy Adventure',
-            'description': 'Ruff-Ruff, Tweet and Dave get to be cowboys for the day at Six Cow Corral.',
-            'timestamp': 1437758640,
-            'upload_date': '20150724',
-            'uploader': 'NBCU-SPROUT-NEW',
-        }
-    }
+            'title': 'Robot Bike Race',
+            'description': 'md5:436b1d97117cc437f54c383f4debc66d',
+            'timestamp': 1606148940,
+            'upload_date': '20201123',
+            'uploader': 'NBCU-MPAT',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.sproutonline.com/watch/cowboy-adventure',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.universalkids.com/watch/robot-bike-race',
+        'only_matching': True,
+    }]
+    _GEO_COUNTRIES = ['US']
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        video_component = self._search_regex(
-            r'(?s)(<div[^>]+data-component="video"[^>]*?>)',
-            webpage, 'video component', default=None)
-        if video_component:
-            options = self._parse_json(extract_attributes(
-                video_component)['data-options'], video_id)
-            theplatform_url = options['video']
-            query = {
-                'mbr': 'true',
-                'manifest': 'm3u',
-            }
-            if options.get('protected'):
-                query['auth'] = self._extract_mvpd_auth(url, options['pid'], 'sprout', 'sprout')
-            theplatform_url = smuggle_url(update_url_query(
-                theplatform_url, query), {'force_smil_url': True})
-        else:
-            iframe = self._search_regex(
-                r'(<iframe[^>]+id="sproutVideoIframe"[^>]*?>)',
-                webpage, 'iframe')
-            theplatform_url = extract_attributes(iframe)['src']
-
-        return self.url_result(theplatform_url, 'ThePlatform')
+        display_id = self._match_id(url)
+        mpx_metadata = self._download_json(
+            # http://nbcuunikidsprod.apps.nbcuni.com/networks/universalkids/content/videos/
+            'https://www.universalkids.com/_api/videos/' + display_id,
+            display_id)['mpxMetadata']
+        media_pid = mpx_metadata['mediaPid']
+        theplatform_url = 'https://link.theplatform.com/s/HNK2IC/' + media_pid
+        query = {
+            'mbr': 'true',
+            'manifest': 'm3u',
+        }
+        if mpx_metadata.get('entitlement') == 'auth':
+            query['auth'] = self._extract_mvpd_auth(url, media_pid, 'sprout', 'sprout')
+        theplatform_url = smuggle_url(
+            update_url_query(theplatform_url, query), {
+                'force_smil_url': True,
+                'geo_countries': self._GEO_COUNTRIES,
+            })
+        return {
+            '_type': 'url_transparent',
+            'id': media_pid,
+            'url': theplatform_url,
+            'series': mpx_metadata.get('seriesName'),
+            'season_number': int_or_none(mpx_metadata.get('seasonNumber')),
+            'episode_number': int_or_none(mpx_metadata.get('episodeNumber')),
+            'ie_key': 'ThePlatform',
+        }
index 97d1ff6811b27140c77932a766b7cb9d3dbfe7b6..b8b5711b1b2ca43576cfdbc361e52201c9d11f28 100644 (file)
@@ -4,25 +4,28 @@
 
 from .common import InfoExtractor
 from ..utils import (
-    determine_ext,
+    clean_html,
+    ExtractorError,
     int_or_none,
-    js_to_json,
-    unescapeHTML,
+    str_or_none,
+    try_get,
 )
 
 
 class StitcherIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/(?:[^/]+/)+e/(?:(?P<display_id>[^/#?&]+?)-)?(?P<id>\d+)(?:[/#?&]|$)'
+    _VALID_URL = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/(?:[^/]+/)+e(?:pisode)?/(?:(?P<display_id>[^/#?&]+?)-)?(?P<id>\d+)(?:[/#?&]|$)'
     _TESTS = [{
         'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true',
-        'md5': '391dd4e021e6edeb7b8e68fbf2e9e940',
+        'md5': 'e9635098e0da10b21a0e2b85585530f6',
         'info_dict': {
             'id': '40789481',
             'ext': 'mp3',
             'title': 'Machine Learning Mastery and Cancer Clusters',
-            'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3',
+            'description': 'md5:547adb4081864be114ae3831b4c2b42f',
             'duration': 1604,
             'thumbnail': r're:^https?://.*\.jpg',
+            'upload_date': '20180126',
+            'timestamp': 1516989316,
         },
     }, {
         'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true',
@@ -38,6 +41,7 @@ class StitcherIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
+        'skip': 'Page Not Found',
     }, {
         # escaped title
         'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true',
@@ -45,37 +49,39 @@ class StitcherIE(InfoExtractor):
     }, {
         'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true',
         'only_matching': True,
+    }, {
+        'url': 'https://www.stitcher.com/show/threedom/episode/circles-on-a-stick-200212584',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        audio_id = mobj.group('id')
-        display_id = mobj.group('display_id') or audio_id
+        display_id, audio_id = re.match(self._VALID_URL, url).groups()
 
-        webpage = self._download_webpage(url, display_id)
+        resp = self._download_json(
+            'https://api.prod.stitcher.com/episode/' + audio_id,
+            display_id or audio_id)
+        episode = try_get(resp, lambda x: x['data']['episodes'][0], dict)
+        if not episode:
+            raise ExtractorError(resp['errors'][0]['message'], expected=True)
 
-        episode = self._parse_json(
-            js_to_json(self._search_regex(
-                r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')),
-            display_id)['config']['episode']
+        title = episode['title'].strip()
+        audio_url = episode['audio_url']
 
-        title = unescapeHTML(episode['title'])
-        formats = [{
-            'url': episode[episode_key],
-            'ext': determine_ext(episode[episode_key]) or 'mp3',
-            'vcodec': 'none',
-        } for episode_key in ('episodeURL',) if episode.get(episode_key)]
-        description = self._search_regex(
-            r'Episode Info:\s*</span>([^<]+)<', webpage, 'description', fatal=False)
-        duration = int_or_none(episode.get('duration'))
-        thumbnail = episode.get('episodeImage')
+        thumbnail = None
+        show_id = episode.get('show_id')
+        if show_id and episode.get('classic_id') != -1:
+            thumbnail = 'https://stitcher-classic.imgix.net/feedimages/%s.jpg' % show_id
 
         return {
             'id': audio_id,
             'display_id': display_id,
             'title': title,
-            'description': description,
-            'duration': duration,
+            'description': clean_html(episode.get('html_description') or episode.get('description')),
+            'duration': int_or_none(episode.get('duration')),
             'thumbnail': thumbnail,
-            'formats': formats,
+            'url': audio_url,
+            'vcodec': 'none',
+            'timestamp': int_or_none(episode.get('date_created')),
+            'season_number': int_or_none(episode.get('season')),
+            'season_id': str_or_none(episode.get('season_id')),
         }
index 91612c7f22d260c8544cd0ead31dd830daab0424..f21681ae784ac4d4e4fd569449f2b80a4d9375f9 100644 (file)
@@ -2,25 +2,40 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import compat_str
-from ..utils import unified_strdate
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+    str_or_none,
+    strip_or_none,
+    try_get,
+    urljoin,
+)
 
 
 class StreetVoiceIE(InfoExtractor):
     _VALID_URL = r'https?://(?:.+?\.)?streetvoice\.com/[^/]+/songs/(?P<id>[0-9]+)'
     _TESTS = [{
-        'url': 'http://streetvoice.com/skippylu/songs/94440/',
-        'md5': '15974627fc01a29e492c98593c2fd472',
+        'url': 'https://streetvoice.com/skippylu/songs/123688/',
+        'md5': '0eb535970629a5195685355f3ed60bfd',
         'info_dict': {
-            'id': '94440',
+            'id': '123688',
             'ext': 'mp3',
-            'title': '',
-            'description': 'Crispy脆樂團 - 輸',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 260,
-            'upload_date': '20091018',
+            'title': '流浪',
+            'description': 'md5:8eb0bfcc9dcd8aa82bd6efca66e3fea6',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 270,
+            'upload_date': '20100923',
             'uploader': 'Crispy脆樂團',
             'uploader_id': '627810',
+            'uploader_url': 're:^https?://streetvoice.com/skippylu/',
+            'timestamp': 1285261661,
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+            'repost_count': int,
+            'track': '流浪',
+            'track_id': '123688',
+            'album': '2010',
         }
     }, {
         'url': 'http://tw.streetvoice.com/skippylu/songs/94440/',
@@ -29,21 +44,57 @@ class StreetVoiceIE(InfoExtractor):
 
     def _real_extract(self, url):
         song_id = self._match_id(url)
+        base_url = 'https://streetvoice.com/api/v4/song/%s/' % song_id
+        song = self._download_json(base_url, song_id, query={
+            'fields': 'album,comments_count,created_at,id,image,length,likes_count,name,nickname,plays_count,profile,share_count,synopsis,user,username',
+        })
+        title = song['name']
 
-        song = self._download_json(
-            'https://streetvoice.com/api/v1/public/song/%s/' % song_id, song_id, data=b'')
+        formats = []
+        for suffix, format_id in [('hls/file', 'hls'), ('file', 'http'), ('file/original', 'original')]:
+            f_url = (self._download_json(
+                base_url + suffix + '/', song_id,
+                'Downloading %s format URL' % format_id,
+                data=b'', fatal=False) or {}).get('file')
+            if not f_url:
+                continue
+            f = {
+                'ext': 'mp3',
+                'format_id': format_id,
+                'url': f_url,
+                'vcodec': 'none',
+            }
+            if format_id == 'hls':
+                f['protocol'] = 'm3u8_native'
+            abr = self._search_regex(r'\.mp3\.(\d+)k', f_url, 'bitrate', default=None)
+            if abr:
+                abr = int(abr)
+                f.update({
+                    'abr': abr,
+                    'tbr': abr,
+                })
+            formats.append(f)
 
-        title = song['name']
-        author = song['user']['nickname']
+        user = song.get('user') or {}
+        username = user.get('username')
+        get_count = lambda x: int_or_none(song.get(x + '_count'))
 
         return {
             'id': song_id,
-            'url': song['file'],
+            'formats': formats,
             'title': title,
-            'description': '%s - %s' % (author, title),
-            'thumbnail': self._proto_relative_url(song.get('image'), 'http:'),
-            'duration': song.get('length'),
-            'upload_date': unified_strdate(song.get('created_at')),
-            'uploader': author,
-            'uploader_id': compat_str(song['user']['id']),
+            'description': strip_or_none(song.get('synopsis')),
+            'thumbnail': song.get('image'),
+            'duration': int_or_none(song.get('length')),
+            'timestamp': parse_iso8601(song.get('created_at')),
+            'uploader': try_get(user, lambda x: x['profile']['nickname']),
+            'uploader_id': str_or_none(user.get('id')),
+            'uploader_url': urljoin(url, '/%s/' % username) if username else None,
+            'view_count': get_count('plays'),
+            'like_count': get_count('likes'),
+            'comment_count': get_count('comments'),
+            'repost_count': get_count('share'),
+            'track': title,
+            'track_id': song_id,
+            'album': try_get(song, lambda x: x['album']['name']),
         }
index a75369dbe8a3582595ae339d58887eaefd220536..2394f86d4b8433dbd443c47ad8ad459b311f69cc 100644 (file)
@@ -140,7 +140,7 @@ class TeachableIE(TeachableBaseIE):
     @staticmethod
     def _is_teachable(webpage):
         return 'teachableTracker.linker:autoLink' in webpage and re.search(
-            r'<link[^>]+href=["\']https?://process\.fs\.teachablecdn\.com',
+            r'<link[^>]+href=["\']https?://(?:process\.fs|assets)\.teachablecdn\.com',
             webpage)
 
     @staticmethod
@@ -269,7 +269,7 @@ def _real_extract(self, url):
                 r'(?s)(?P<li><li[^>]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?</li>)',
                 webpage):
             li = mobj.group('li')
-            if 'fa-youtube-play' not in li:
+            if 'fa-youtube-play' not in li and not re.search(r'\d{1,2}:\d{2}', li):
                 continue
             lecture_url = self._search_regex(
                 r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', li,
index 9ba3da341dac65d18a599a790bff9c95b0e52eb8..eecd6a5c9b692e314921754fe5ef002eee940c3f 100644 (file)
@@ -5,14 +5,11 @@
 import re
 
 from .common import InfoExtractor
-from .ooyala import OoyalaIE
 from ..utils import (
     clean_html,
-    determine_ext,
     int_or_none,
     str_or_none,
     try_get,
-    urljoin,
 )
 
 
@@ -28,7 +25,7 @@ class TelecincoIE(InfoExtractor):
             'description': 'md5:716caf5601e25c3c5ab6605b1ae71529',
         },
         'playlist': [{
-            'md5': 'adb28c37238b675dad0f042292f209a7',
+            'md5': '7ee56d665cfd241c0e6d80fd175068b0',
             'info_dict': {
                 'id': 'JEA5ijCnF6p5W08A1rNKn7',
                 'ext': 'mp4',
@@ -38,7 +35,7 @@ class TelecincoIE(InfoExtractor):
         }]
     }, {
         'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html',
-        'md5': '9468140ebc300fbb8b9d65dc6e5c4b43',
+        'md5': 'c86fe0d99e3bdb46b7950d38bf6ef12a',
         'info_dict': {
             'id': 'jn24Od1zGLG4XUZcnUnZB6',
             'ext': 'mp4',
@@ -48,7 +45,7 @@ class TelecincoIE(InfoExtractor):
         },
     }, {
         'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html',
-        'md5': 'ae2dc6b7b50b2392076a51c0f70e01f6',
+        'md5': 'eddb50291df704ce23c74821b995bcac',
         'info_dict': {
             'id': 'aywerkD2Sv1vGNqq9b85Q2',
             'ext': 'mp4',
@@ -90,58 +87,24 @@ class TelecincoIE(InfoExtractor):
 
     def _parse_content(self, content, url):
         video_id = content['dataMediaId']
-        if content.get('dataCmsId') == 'ooyala':
-            return self.url_result(
-                'ooyala:%s' % video_id, OoyalaIE.ie_key(), video_id)
-        config_url = urljoin(url, content['dataConfig'])
         config = self._download_json(
-            config_url, video_id, 'Downloading config JSON')
+            content['dataConfig'], video_id, 'Downloading config JSON')
         title = config['info']['title']
-
-        def mmc_url(mmc_type):
-            return re.sub(
-                r'/(?:flash|html5)\.json', '/%s.json' % mmc_type,
-                config['services']['mmc'])
-
-        duration = None
-        formats = []
-        for mmc_type in ('flash', 'html5'):
-            mmc = self._download_json(
-                mmc_url(mmc_type), video_id,
-                'Downloading %s mmc JSON' % mmc_type, fatal=False)
-            if not mmc:
-                continue
-            if not duration:
-                duration = int_or_none(mmc.get('duration'))
-            for location in mmc['locations']:
-                gat = self._proto_relative_url(location.get('gat'), 'http:')
-                gcp = location.get('gcp')
-                ogn = location.get('ogn')
-                if None in (gat, gcp, ogn):
-                    continue
-                token_data = {
-                    'gcp': gcp,
-                    'ogn': ogn,
-                    'sta': 0,
-                }
-                media = self._download_json(
-                    gat, video_id, data=json.dumps(token_data).encode('utf-8'),
-                    headers={
-                        'Content-Type': 'application/json;charset=utf-8',
-                        'Referer': url,
-                    }, fatal=False) or {}
-                stream = media.get('stream') or media.get('file')
-                if not stream:
-                    continue
-                ext = determine_ext(stream)
-                if ext == 'f4m':
-                    formats.extend(self._extract_f4m_formats(
-                        stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
-                        video_id, f4m_id='hds', fatal=False))
-                elif ext == 'm3u8':
-                    formats.extend(self._extract_m3u8_formats(
-                        stream, video_id, 'mp4', 'm3u8_native',
-                        m3u8_id='hls', fatal=False))
+        services = config['services']
+        caronte = self._download_json(services['caronte'], video_id)
+        stream = caronte['dls'][0]['stream']
+        headers = self.geo_verification_headers()
+        headers.update({
+            'Content-Type': 'application/json;charset=UTF-8',
+            'Origin': re.match(r'https?://[^/]+', url).group(0),
+        })
+        cdn = self._download_json(
+            caronte['cerbero'], video_id, data=json.dumps({
+                'bbx': caronte['bbx'],
+                'gbx': self._download_json(services['gbx'], video_id)['gbx'],
+            }).encode(), headers=headers)['tokens']['1']['cdn']
+        formats = self._extract_m3u8_formats(
+            stream + '?' + cdn, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
         self._sort_formats(formats)
 
         return {
@@ -149,7 +112,7 @@ def mmc_url(mmc_type):
             'title': title,
             'formats': formats,
             'thumbnail': content.get('dataPoster') or config.get('poster', {}).get('imageUrl'),
-            'duration': duration,
+            'duration': int_or_none(content.get('dataDuration')),
         }
 
     def _real_extract(self, url):
index b4c485b9be38d492401be9052fb1ddbbe461e4cc..800d87b70dfd6bbbc7790248ceac836ffefa576e 100644 (file)
 
 
 class TeleQuebecBaseIE(InfoExtractor):
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
+
     @staticmethod
-    def _result(url, ie_key):
+    def _brightcove_result(brightcove_id, player_id, account_id='6150020952001'):
         return {
             '_type': 'url_transparent',
-            'url': smuggle_url(url, {'geo_countries': ['CA']}),
-            'ie_key': ie_key,
+            'url': smuggle_url(TeleQuebecBaseIE.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, brightcove_id), {'geo_countries': ['CA']}),
+            'ie_key': 'BrightcoveNew',
         }
 
-    @staticmethod
-    def _limelight_result(media_id):
-        return TeleQuebecBaseIE._result(
-            'limelight:media:' + media_id, 'LimelightMedia')
-
-    @staticmethod
-    def _brightcove_result(brightcove_id):
-        return TeleQuebecBaseIE._result(
-            'http://players.brightcove.net/6150020952001/default_default/index.html?videoId=%s'
-            % brightcove_id, 'BrightcoveNew')
-
 
 class TeleQuebecIE(TeleQuebecBaseIE):
     _VALID_URL = r'''(?x)
@@ -44,14 +35,18 @@ class TeleQuebecIE(TeleQuebecBaseIE):
         # available till 01.01.2023
         'url': 'http://zonevideo.telequebec.tv/media/37578/un-petit-choc-et-puis-repart/un-chef-a-la-cabane',
         'info_dict': {
-            'id': '577116881b4b439084e6b1cf4ef8b1b3',
+            'id': '6155972771001',
             'ext': 'mp4',
             'title': 'Un petit choc et puis repart!',
-            'description': 'md5:067bc84bd6afecad85e69d1000730907',
+            'description': 'md5:b04a7e6b3f74e32d7b294cffe8658374',
+            'timestamp': 1589262469,
+            'uploader_id': '6150020952001',
+            'upload_date': '20200512',
         },
         'params': {
-            'skip_download': True,
+            'format': 'bestvideo',
         },
+        'add_ie': ['BrightcoveNew'],
     }, {
         'url': 'https://zonevideo.telequebec.tv/media/55267/le-soleil/passe-partout',
         'info_dict': {
@@ -65,7 +60,6 @@ class TeleQuebecIE(TeleQuebecBaseIE):
         },
         'params': {
             'format': 'bestvideo',
-            'skip_download': True,
         },
         'add_ie': ['BrightcoveNew'],
     }, {
@@ -79,25 +73,20 @@ class TeleQuebecIE(TeleQuebecBaseIE):
 
     def _real_extract(self, url):
         media_id = self._match_id(url)
-
-        media_data = self._download_json(
-            'https://mnmedias.api.telequebec.tv/api/v2/media/' + media_id,
+        media = self._download_json(
+            'https://mnmedias.api.telequebec.tv/api/v3/media/' + media_id,
             media_id)['media']
-
-        source_id = media_data['streamInfo']['sourceId']
-        source = (try_get(
-            media_data, lambda x: x['streamInfo']['source'],
-            compat_str) or 'limelight').lower()
-        if source == 'brightcove':
-            info = self._brightcove_result(source_id)
-        else:
-            info = self._limelight_result(source_id)
+        source_id = next(source_info['sourceId'] for source_info in media['streamInfos'] if source_info.get('source') == 'Brightcove')
+        info = self._brightcove_result(source_id, '22gPKdt7f')
+        product = media.get('product') or {}
+        season = product.get('season') or {}
         info.update({
-            'title': media_data.get('title'),
-            'description': try_get(
-                media_data, lambda x: x['descriptions'][0]['text'], compat_str),
-            'duration': int_or_none(
-                media_data.get('durationInMilliseconds'), 1000),
+            'description': try_get(media, lambda x: x['descriptions'][-1]['text'], compat_str),
+            'series': try_get(season, lambda x: x['serie']['titre']),
+            'season': season.get('name'),
+            'season_number': int_or_none(season.get('seasonNo')),
+            'episode': product.get('titre'),
+            'episode_number': int_or_none(product.get('episodeNo')),
         })
         return info
 
@@ -148,7 +137,7 @@ def _real_extract(self, url):
         }
 
 
-class TeleQuebecEmissionIE(TeleQuebecBaseIE):
+class TeleQuebecEmissionIE(InfoExtractor):
     _VALID_URL = r'''(?x)
                     https?://
                         (?:
@@ -160,15 +149,16 @@ class TeleQuebecEmissionIE(TeleQuebecBaseIE):
     _TESTS = [{
         'url': 'http://lindicemcsween.telequebec.tv/emissions/100430013/des-soins-esthetiques-a-377-d-interets-annuels-ca-vous-tente',
         'info_dict': {
-            'id': '66648a6aef914fe3badda25e81a4d50a',
+            'id': '6154476028001',
             'ext': 'mp4',
-            'title': "Des soins esthétiques à 377 % d'intérêts annuels, ça vous tente?",
-            'description': 'md5:369e0d55d0083f1fc9b71ffb640ea014',
-            'upload_date': '20171024',
-            'timestamp': 1508862118,
+            'title': 'Des soins esthétiques à 377 % d’intérêts annuels, ça vous tente?',
+            'description': 'md5:cb4d378e073fae6cce1f87c00f84ae9f',
+            'upload_date': '20200505',
+            'timestamp': 1588713424,
+            'uploader_id': '6150020952001',
         },
         'params': {
-            'skip_download': True,
+            'format': 'bestvideo',
         },
     }, {
         'url': 'http://bancpublic.telequebec.tv/emissions/emission-49/31986/jeunes-meres-sous-pression',
@@ -187,26 +177,26 @@ def _real_extract(self, url):
         webpage = self._download_webpage(url, display_id)
 
         media_id = self._search_regex(
-            r'mediaUID\s*:\s*["\'][Ll]imelight_(?P<id>[a-z0-9]{32})', webpage,
-            'limelight id')
+            r'mediaId\s*:\s*(?P<id>\d+)', webpage, 'media id')
 
-        info = self._limelight_result(media_id)
-        info.update({
-            'title': self._og_search_title(webpage, default=None),
-            'description': self._og_search_description(webpage, default=None),
-        })
-        return info
+        return self.url_result(
+            'http://zonevideo.telequebec.tv/media/' + media_id,
+            TeleQuebecIE.ie_key())
 
 
-class TeleQuebecLiveIE(InfoExtractor):
+class TeleQuebecLiveIE(TeleQuebecBaseIE):
     _VALID_URL = r'https?://zonevideo\.telequebec\.tv/(?P<id>endirect)'
     _TEST = {
         'url': 'http://zonevideo.telequebec.tv/endirect/',
         'info_dict': {
-            'id': 'endirect',
+            'id': '6159095684001',
             'ext': 'mp4',
-            'title': 're:^Télé-Québec - En direct [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'title': 're:^Télé-Québec [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
             'is_live': True,
+            'description': 'Canal principal de Télé-Québec',
+            'uploader_id': '6150020952001',
+            'timestamp': 1590439901,
+            'upload_date': '20200525',
         },
         'params': {
             'skip_download': True,
@@ -214,25 +204,49 @@ class TeleQuebecLiveIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        return self._brightcove_result('6159095684001', 'skCsmi2Uw')
 
-        m3u8_url = None
-        webpage = self._download_webpage(
-            'https://player.telequebec.tv/Tq_VideoPlayer.js', video_id,
-            fatal=False)
-        if webpage:
-            m3u8_url = self._search_regex(
-                r'm3U8Url\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
-                'm3u8 url', default=None, group='url')
-        if not m3u8_url:
-            m3u8_url = 'https://teleqmmd.mmdlive.lldns.net/teleqmmd/f386e3b206814e1f8c8c1c71c0f8e748/manifest.m3u8'
-        formats = self._extract_m3u8_formats(
-            m3u8_url, video_id, 'mp4', m3u8_id='hls')
-        self._sort_formats(formats)
 
-        return {
-            'id': video_id,
-            'title': self._live_title('Télé-Québec - En direct'),
-            'is_live': True,
-            'formats': formats,
-        }
+class TeleQuebecVideoIE(TeleQuebecBaseIE):
+    _VALID_URL = r'https?://video\.telequebec\.tv/player(?:-live)?/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://video.telequebec.tv/player/31110/stream',
+        'info_dict': {
+            'id': '6202570652001',
+            'ext': 'mp4',
+            'title': 'Le coût du véhicule le plus vendu au Canada / Tous les frais liés à la procréation assistée',
+            'description': 'md5:685a7e4c450ba777c60adb6e71e41526',
+            'upload_date': '20201019',
+            'timestamp': 1603115930,
+            'uploader_id': '6101674910001',
+        },
+        'params': {
+            'format': 'bestvideo',
+        },
+    }, {
+        'url': 'https://video.telequebec.tv/player-live/28527',
+        'only_matching': True,
+    }]
+
+    def _call_api(self, path, video_id):
+        return self._download_json(
+            'http://beacon.playback.api.brightcove.com/telequebec/api/assets/' + path,
+            video_id, query={'device_layout': 'web', 'device_type': 'web'})['data']
+
+    def _real_extract(self, url):
+        asset_id = self._match_id(url)
+        asset = self._call_api(asset_id, asset_id)['asset']
+        stream = self._call_api(
+            asset_id + '/streams/' + asset['streams'][0]['id'], asset_id)['stream']
+        stream_url = stream['url']
+        account_id = try_get(
+            stream, lambda x: x['video_provider_details']['account_id']) or '6101674910001'
+        info = self._brightcove_result(stream_url, 'default', account_id)
+        info.update({
+            'description': asset.get('long_description') or asset.get('short_description'),
+            'series': asset.get('series_original_name'),
+            'season_number': int_or_none(asset.get('season_number')),
+            'episode': asset.get('original_name'),
+            'episode_number': int_or_none(asset.get('episode_number')),
+        })
+        return info
index af325fea8fcd68ce5cf9b8bb8ec33975950b5c32..cd30d57f47c73e2a046d36630c3567c790aaa948 100644 (file)
@@ -3,9 +3,10 @@
 
 from .common import InfoExtractor
 from ..utils import (
+    HEADRequest,
     parse_age_limit,
     parse_iso8601,
-    smuggle_url,
+    smuggle_url,
 )
 
 
@@ -24,14 +25,16 @@ class TenPlayIE(InfoExtractor):
             'uploader_id': '2199827728001',
         },
         'params': {
-            'format': 'bestvideo',
+            'format': 'bestvideo',
             'skip_download': True,
         }
     }, {
         'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc',
         'only_matching': True,
     }]
-    BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s'
+    # BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s'
+    _GEO_BYPASS = False
+    _FASTLY_URL_TEMPL = 'https://10-selector.global.ssl.fastly.net/s/kYEXFC/media/%s?mbr=true&manifest=m3u&format=redirect'
 
     def _real_extract(self, url):
         content_id = self._match_id(url)
@@ -40,19 +43,28 @@ def _real_extract(self, url):
         video = data.get('video') or {}
         metadata = data.get('metaData') or {}
         brightcove_id = video.get('videoId') or metadata['showContentVideoId']
-        brightcove_url = smuggle_url(
-            self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
-            {'geo_countries': ['AU']})
+        # brightcove_url = smuggle_url(
+        #     self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+        #     {'geo_countries': ['AU']})
+        m3u8_url = self._request_webpage(HEADRequest(
+            self._FASTLY_URL_TEMPL % brightcove_id), brightcove_id).geturl()
+        if '10play-not-in-oz' in m3u8_url:
+            self.raise_geo_restricted(countries=['AU'])
+        formats = self._extract_m3u8_formats(m3u8_url, brightcove_id, 'mp4')
+        self._sort_formats(formats)
 
         return {
-            '_type': 'url_transparent',
-            'url': brightcove_url,
-            'id': content_id,
-            'title': video.get('title') or metadata.get('pageContentName') or metadata.get('showContentName'),
+            # '_type': 'url_transparent',
+            # 'url': brightcove_url,
+            'formats': formats,
+            'id': brightcove_id,
+            'title': video.get('title') or metadata.get('pageContentName') or metadata['showContentName'],
             'description': video.get('description'),
             'age_limit': parse_age_limit(video.get('showRatingClassification') or metadata.get('showProgramClassification')),
             'series': metadata.get('showName'),
             'season': metadata.get('showContentSeason'),
             'timestamp': parse_iso8601(metadata.get('contentPublishDate') or metadata.get('pageContentPublishDate')),
-            'ie_key': 'BrightcoveNew',
+            'thumbnail': video.get('poster'),
+            'uploader_id': '2199827728001',
+            # 'ie_key': 'BrightcoveNew',
         }
index 41bfbe80f48c7fa41638aadd53fe8769a783c6fe..adfe11e314212e9c9e50491c747f3560a0e32bfe 100644 (file)
@@ -234,6 +234,9 @@ def hex_to_bytes(hex):
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
+        self._initialize_geo_bypass({
+            'countries': smuggled_data.get('geo_countries'),
+        })
 
         mobj = re.match(self._VALID_URL, url)
         provider_id = mobj.group('provider_id')
index c34a49d0368583f075808242ecf80a8752899b95..b2a8c3797e9ebe43da451a39da5ab3d56c043aa6 100644 (file)
@@ -1,18 +1,22 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import json
+import re
+
 from .theplatform import ThePlatformIE
 from ..utils import (
     determine_ext,
     parse_duration,
+    parse_iso8601,
 )
 
 
 class TheWeatherChannelIE(ThePlatformIE):
-    _VALID_URL = r'https?://(?:www\.)?weather\.com/(?:[^/]+/)*video/(?P<id>[^/?#]+)'
+    _VALID_URL = r'https?://(?:www\.)?weather\.com(?P<asset_name>(?:/(?P<locale>[a-z]{2}-[A-Z]{2}))?/(?:[^/]+/)*video/(?P<id>[^/?#]+))'
     _TESTS = [{
         'url': 'https://weather.com/series/great-outdoors/video/ice-climber-is-in-for-a-shock',
-        'md5': 'ab924ac9574e79689c24c6b95e957def',
+        'md5': 'c4cbe74c9c17c5676b704b950b73dd92',
         'info_dict': {
             'id': 'cc82397e-cc3f-4d11-9390-a785add090e8',
             'ext': 'mp4',
@@ -20,18 +24,33 @@ class TheWeatherChannelIE(ThePlatformIE):
             'description': 'md5:55606ce1378d4c72e6545e160c9d9695',
             'uploader': 'TWC - Digital (No Distro)',
             'uploader_id': '6ccd5455-16bb-46f2-9c57-ff858bb9f62c',
+            'upload_date': '20160720',
+            'timestamp': 1469018835,
         }
+    }, {
+        'url': 'https://weather.com/en-CA/international/videos/video/unidentified-object-falls-from-sky-in-india',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-        drupal_settings = self._parse_json(self._search_regex(
-            r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
-            webpage, 'drupal settings'), display_id)
-        video_id = drupal_settings['twc']['contexts']['node']['uuid']
-        video_data = self._download_json(
-            'https://dsx.weather.com/cms/v4/asset-collection/en_US/' + video_id, video_id)
+        asset_name, locale, display_id = re.match(self._VALID_URL, url).groups()
+        if not locale:
+            locale = 'en-US'
+        video_data = list(self._download_json(
+            'https://weather.com/api/v1/p/redux-dal', display_id, data=json.dumps([{
+                'name': 'getCMSAssetsUrlConfig',
+                'params': {
+                    'language': locale.replace('-', '_'),
+                    'query': {
+                        'assetName': {
+                            '$in': asset_name,
+                        },
+                    },
+                }
+            }]).encode(), headers={
+                'Content-Type': 'application/json',
+            })['dal']['getCMSAssetsUrlConfig'].values())[0]['data'][0]
+        video_id = video_data['id']
         seo_meta = video_data.get('seometa', {})
         title = video_data.get('title') or seo_meta['title']
 
@@ -66,6 +85,8 @@ def _real_extract(self, url):
                 })
         self._sort_formats(formats)
 
+        cc_url = video_data.get('cc_url')
+
         return {
             'id': video_id,
             'display_id': display_id,
@@ -74,6 +95,8 @@ def _real_extract(self, url):
             'duration': parse_duration(video_data.get('duration')),
             'uploader': video_data.get('providername'),
             'uploader_id': video_data.get('providerid'),
+            'timestamp': parse_iso8601(video_data.get('publishdate')),
+            'subtitles': {locale[:2]: [{'url': cc_url}]} if cc_url else None,
             'thumbnails': thumbnails,
             'formats': formats,
         }
index ca2e36efe4216ad66d46252c662ea4cc5395c3ca..270c84daa15070f640f0289c0971624e6d2a70d3 100644 (file)
     float_or_none,
     int_or_none,
     parse_iso8601,
-    sanitized_Request,
+    strip_or_none,
 )
 
 
 class ToggleIE(InfoExtractor):
     IE_NAME = 'toggle'
-    _VALID_URL = r'https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)'
+    _VALID_URL = r'(?:https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}|toggle:)(?P<id>[0-9]+)'
     _TESTS = [{
         'url': 'http://www.mewatch.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115',
         'info_dict': {
@@ -84,28 +84,12 @@ class ToggleIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    _FORMAT_PREFERENCES = {
-        'wvm-STBMain': -10,
-        'wvm-iPadMain': -20,
-        'wvm-iPhoneMain': -30,
-        'wvm-Android': -40,
-    }
     _API_USER = 'tvpapi_147'
     _API_PASS = '11111'
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(
-            url, video_id, note='Downloading video page')
-
-        api_user = self._search_regex(
-            r'apiUser\s*:\s*(["\'])(?P<user>.+?)\1', webpage, 'apiUser',
-            default=self._API_USER, group='user')
-        api_pass = self._search_regex(
-            r'apiPass\s*:\s*(["\'])(?P<pass>.+?)\1', webpage, 'apiPass',
-            default=self._API_PASS, group='pass')
-
         params = {
             'initObj': {
                 'Locale': {
@@ -118,17 +102,16 @@ def _real_extract(self, url):
                 'SiteGuid': 0,
                 'DomainID': '0',
                 'UDID': '',
-                'ApiUser': api_user,
-                'ApiPass': api_pass
+                'ApiUser': self._API_USER,
+                'ApiPass': self._API_PASS
             },
             'MediaID': video_id,
             'mediaType': 0,
         }
 
-        req = sanitized_Request(
+        info = self._download_json(
             'http://tvpapi.as.tvinci.com/v2_9/gateways/jsonpostgw.aspx?m=GetMediaInfo',
-            json.dumps(params).encode('utf-8'))
-        info = self._download_json(req, video_id, 'Downloading video info json')
+            video_id, 'Downloading video info json', data=json.dumps(params).encode('utf-8'))
 
         title = info['MediaName']
 
@@ -141,11 +124,16 @@ def _real_extract(self, url):
             vid_format = vid_format.replace(' ', '')
             # if geo-restricted, m3u8 is inaccessible, but mp4 is okay
             if ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
+                m3u8_formats = self._extract_m3u8_formats(
                     video_url, video_id, ext='mp4', m3u8_id=vid_format,
                     note='Downloading %s m3u8 information' % vid_format,
                     errnote='Failed to download %s m3u8 information' % vid_format,
-                    fatal=False))
+                    fatal=False)
+                for f in m3u8_formats:
+                    # Apple FairPlay Streaming
+                    if '/fpshls/' in f['url']:
+                        continue
+                    formats.append(f)
             elif ext == 'mpd':
                 formats.extend(self._extract_mpd_formats(
                     video_url, video_id, mpd_id=vid_format,
@@ -158,28 +146,21 @@ def _real_extract(self, url):
                     note='Downloading %s ISM manifest' % vid_format,
                     errnote='Failed to download %s ISM manifest' % vid_format,
                     fatal=False))
-            elif ext in ('mp4', 'wvm'):
-                # wvm are drm-protected files
+            elif ext == 'mp4':
                 formats.append({
                     'ext': ext,
                     'url': video_url,
                     'format_id': vid_format,
-                    'preference': self._FORMAT_PREFERENCES.get(ext + '-' + vid_format) or -1,
-                    'format_note': 'DRM-protected video' if ext == 'wvm' else None
                 })
         if not formats:
+            for meta in (info.get('Metas') or []):
+                if meta.get('Key') == 'Encryption' and meta.get('Value') == '1':
+                    raise ExtractorError(
+                        'This video is DRM protected.', expected=True)
             # Most likely because geo-blocked
             raise ExtractorError('No downloadable videos found', expected=True)
         self._sort_formats(formats)
 
-        duration = int_or_none(info.get('Duration'))
-        description = info.get('Description')
-        created_at = parse_iso8601(info.get('CreationDate') or None)
-
-        average_rating = float_or_none(info.get('Rating'))
-        view_count = int_or_none(info.get('ViewCounter') or info.get('view_counter'))
-        like_count = int_or_none(info.get('LikeCounter') or info.get('like_counter'))
-
         thumbnails = []
         for picture in info.get('Pictures', []):
             if not isinstance(picture, dict):
@@ -199,15 +180,55 @@ def _real_extract(self, url):
                 })
             thumbnails.append(thumbnail)
 
+        def counter(prefix):
+            return int_or_none(
+                info.get(prefix + 'Counter') or info.get(prefix.lower() + '_counter'))
+
         return {
             'id': video_id,
             'title': title,
-            'description': description,
-            'duration': duration,
-            'timestamp': created_at,
-            'average_rating': average_rating,
-            'view_count': view_count,
-            'like_count': like_count,
+            'description': strip_or_none(info.get('Description')),
+            'duration': int_or_none(info.get('Duration')),
+            'timestamp': parse_iso8601(info.get('CreationDate') or None),
+            'average_rating': float_or_none(info.get('Rating')),
+            'view_count': counter('View'),
+            'like_count': counter('Like'),
             'thumbnails': thumbnails,
             'formats': formats,
         }
+
+
+class MeWatchIE(InfoExtractor):
+    IE_NAME = 'mewatch'
+    _VALID_URL = r'https?://(?:(?:www|live)\.)?mewatch\.sg/watch/[^/?#&]+-(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'https://www.mewatch.sg/watch/Recipe-Of-Life-E1-179371',
+        'info_dict': {
+            'id': '1008625',
+            'ext': 'mp4',
+            'title': 'Recipe Of Life 味之道',
+            'timestamp': 1603306526,
+            'description': 'md5:6e88cde8af2068444fc8e1bc3ebf257c',
+            'upload_date': '20201021',
+        },
+        'params': {
+            'skip_download': 'm3u8 download',
+        },
+    }, {
+        'url': 'https://www.mewatch.sg/watch/Little-Red-Dot-Detectives-S2-搜密。打卡。小红点-S2-E1-176232',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.mewatch.sg/watch/Little-Red-Dot-Detectives-S2-%E6%90%9C%E5%AF%86%E3%80%82%E6%89%93%E5%8D%A1%E3%80%82%E5%B0%8F%E7%BA%A2%E7%82%B9-S2-E1-176232',
+        'only_matching': True,
+    }, {
+        'url': 'https://live.mewatch.sg/watch/Recipe-Of-Life-E41-189759',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        item_id = self._match_id(url)
+        custom_id = self._download_json(
+            'https://cdn.mewatch.sg/api/items/' + item_id,
+            item_id, query={'segments': 'all'})['customId']
+        return self.url_result(
+            'toggle:' + custom_id, ToggleIE.ie_key(), custom_id)
index a51fa6515e6e09d4e2b1794c475f8c251eaf2385..ebfb05c636dffefdd105a52290048bf01d5b9ce6 100644 (file)
@@ -33,6 +33,19 @@ class TubiTvIE(InfoExtractor):
     }, {
         'url': 'http://tubitv.com/movies/383676/tracker',
         'only_matching': True,
+    }, {
+        'url': 'https://tubitv.com/movies/560057/penitentiary?start=true',
+        'info_dict': {
+            'id': '560057',
+            'ext': 'mp4',
+            'title': 'Penitentiary',
+            'description': 'md5:8d2fc793a93cc1575ff426fdcb8dd3f9',
+            'uploader_id': 'd8fed30d4f24fcb22ec294421b9defc2',
+            'release_year': 1979,
+        },
+        'params': {
+            'skip_download': True,
+        },
     }]
 
     def _login(self):
@@ -93,4 +106,5 @@ def _real_extract(self, url):
             'description': video_data.get('description'),
             'duration': int_or_none(video_data.get('duration')),
             'uploader_id': video_data.get('publisher_id'),
+            'release_year': int_or_none(video_data.get('year')),
         }
index 2964504a28d27e71ac97a89c0cbee380dfe9e4b7..81229a54be34a90af845ce0b0f142321ea5ad691 100644 (file)
@@ -6,6 +6,7 @@
 from .adobepass import AdobePassIE
 from ..compat import compat_str
 from ..utils import (
+    fix_xml_ampersands,
     xpath_text,
     int_or_none,
     determine_ext,
@@ -49,8 +50,13 @@ def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data, c
             self._AKAMAI_SPE_TOKEN_CACHE[secure_path] = token
         return video_url + '?hdnea=' + token
 
-    def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}):
-        video_data = self._download_xml(data_src, video_id)
+    def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}, fatal=False):
+        video_data = self._download_xml(
+            data_src, video_id,
+            transform_source=lambda s: fix_xml_ampersands(s).strip(),
+            fatal=fatal)
+        if not video_data:
+            return {}
         video_id = video_data.attrib['id']
         title = xpath_text(video_data, 'headline', fatal=True)
         content_id = xpath_text(video_data, 'contentId') or video_id
@@ -63,12 +69,14 @@ def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}):
 
         urls = []
         formats = []
+        thumbnails = []
+        subtitles = {}
         rex = re.compile(
             r'(?P<width>[0-9]+)x(?P<height>[0-9]+)(?:_(?P<bitrate>[0-9]+))?')
         # Possible formats locations: files/file, files/groupFiles/files
         # and maybe others
         for video_file in video_data.findall('.//file'):
-            video_url = video_file.text.strip()
+            video_url = url_or_none(video_file.text.strip())
             if not video_url:
                 continue
             ext = determine_ext(video_url)
@@ -108,9 +116,28 @@ def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}):
                 continue
             urls.append(video_url)
             format_id = video_file.get('bitrate')
-            if ext == 'smil':
+            if ext in ('scc', 'srt', 'vtt'):
+                subtitles.setdefault('en', []).append({
+                    'ext': ext,
+                    'url': video_url,
+                })
+            elif ext == 'png':
+                thumbnails.append({
+                    'id': format_id,
+                    'url': video_url,
+                })
+            elif ext == 'smil':
                 formats.extend(self._extract_smil_formats(
                     video_url, video_id, fatal=False))
+            elif re.match(r'https?://[^/]+\.akamaihd\.net/[iz]/', video_url):
+                formats.extend(self._extract_akamai_formats(
+                    video_url, video_id, {
+                        'hds': path_data.get('f4m', {}).get('host'),
+                        # nba.cdn.turner.com, ht.cdn.turner.com, ht2.cdn.turner.com
+                        # ht3.cdn.turner.com, i.cdn.turner.com, s.cdn.turner.com
+                        # ssl.cdn.turner.com
+                        'http': 'pmd.cdn.turner.com',
+                    }))
             elif ext == 'm3u8':
                 m3u8_formats = self._extract_m3u8_formats(
                     video_url, video_id, 'mp4',
@@ -129,7 +156,7 @@ def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}):
                     'url': video_url,
                     'ext': ext,
                 }
-                mobj = rex.search(format_id + video_url)
+                mobj = rex.search(video_url)
                 if mobj:
                     f.update({
                         'width': int(mobj.group('width')),
@@ -152,7 +179,6 @@ def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}):
                 formats.append(f)
         self._sort_formats(formats)
 
-        subtitles = {}
         for source in video_data.findall('closedCaptions/source'):
             for track in source.findall('track'):
                 track_url = url_or_none(track.get('url'))
@@ -168,12 +194,12 @@ def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}):
                     }.get(source.get('format'))
                 })
 
-        thumbnails = [{
-            'id': image.get('cut'),
+        thumbnails.extend({
+            'id': image.get('cut') or image.get('name'),
             'url': image.text,
             'width': int_or_none(image.get('width')),
             'height': int_or_none(image.get('height')),
-        } for image in video_data.findall('images/image')]
+        } for image in video_data.findall('images/image'))
 
         is_live = xpath_text(video_data, 'isLive') == 'true'
 
diff --git a/youtube_dlc/extractor/tv5unis.py b/youtube_dlc/extractor/tv5unis.py
new file mode 100644 (file)
index 0000000..eabdc22
--- /dev/null
@@ -0,0 +1,121 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_age_limit,
+    smuggle_url,
+    try_get,
+)
+
+
+class TV5UnisBaseIE(InfoExtractor):
+    _GEO_COUNTRIES = ['CA']
+
+    def _real_extract(self, url):
+        groups = re.match(self._VALID_URL, url).groups()
+        product = self._download_json(
+            'https://api.tv5unis.ca/graphql', groups[0], query={
+                'query': '''{
+  %s(%s) {
+    collection {
+      title
+    }
+    episodeNumber
+    rating {
+      name
+    }
+    seasonNumber
+    tags
+    title
+    videoElement {
+      ... on Video {
+        mediaId
+      }
+    }
+  }
+}''' % (self._GQL_QUERY_NAME, self._gql_args(groups)),
+            })['data'][self._GQL_QUERY_NAME]
+        media_id = product['videoElement']['mediaId']
+
+        return {
+            '_type': 'url_transparent',
+            'id': media_id,
+            'title': product.get('title'),
+            'url': smuggle_url('limelight:media:' + media_id, {'geo_countries': self._GEO_COUNTRIES}),
+            'age_limit': parse_age_limit(try_get(product, lambda x: x['rating']['name'])),
+            'tags': product.get('tags'),
+            'series': try_get(product, lambda x: x['collection']['title']),
+            'season_number': int_or_none(product.get('seasonNumber')),
+            'episode_number': int_or_none(product.get('episodeNumber')),
+            'ie_key': 'LimelightMedia',
+        }
+
+
+class TV5UnisVideoIE(TV5UnisBaseIE):
+    IE_NAME = 'tv5unis:video'
+    _VALID_URL = r'https?://(?:www\.)?tv5unis\.ca/videos/[^/]+/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://www.tv5unis.ca/videos/bande-annonces/71843',
+        'md5': '3d794164928bda97fb87a17e89923d9b',
+        'info_dict': {
+            'id': 'a883684aecb2486cad9bdc7bbe17f861',
+            'ext': 'mp4',
+            'title': 'Watatatow',
+            'duration': 10.01,
+        }
+    }
+    _GQL_QUERY_NAME = 'productById'
+
+    @staticmethod
+    def _gql_args(groups):
+        return 'id: %s' % groups
+
+
+class TV5UnisIE(TV5UnisBaseIE):
+    IE_NAME = 'tv5unis'
+    _VALID_URL = r'https?://(?:www\.)?tv5unis\.ca/videos/(?P<id>[^/]+)(?:/saisons/(?P<season_number>\d+)/episodes/(?P<episode_number>\d+))?/?(?:[?#&]|$)'
+    _TESTS = [{
+        'url': 'https://www.tv5unis.ca/videos/watatatow/saisons/6/episodes/1',
+        'md5': 'a479907d2e531a73e1f8dc48d6388d02',
+        'info_dict': {
+            'id': 'e5ee23a586c44612a56aad61accf16ef',
+            'ext': 'mp4',
+            'title': 'Je ne peux pas lui résister',
+            'description': "Atys, le nouveau concierge de l'école, a réussi à ébranler la confiance de Mado en affirmant qu\'une médaille, ce n'est que du métal. Comme Mado essaie de lui prouver que ses valeurs sont solides, il veut la mettre à l'épreuve...",
+            'subtitles': {
+                'fr': 'count:1',
+            },
+            'duration': 1370,
+            'age_limit': 8,
+            'tags': 'count:3',
+            'series': 'Watatatow',
+            'season_number': 6,
+            'episode_number': 1,
+        },
+    }, {
+        'url': 'https://www.tv5unis.ca/videos/le-voyage-de-fanny',
+        'md5': '9ca80ebb575c681d10cae1adff3d4774',
+        'info_dict': {
+            'id': '726188eefe094d8faefb13381d42bc06',
+            'ext': 'mp4',
+            'title': 'Le voyage de Fanny',
+            'description': "Fanny, 12 ans, cachée dans un foyer loin de ses parents, s'occupe de ses deux soeurs. Devant fuir, Fanny prend la tête d'un groupe de huit enfants et s'engage dans un dangereux périple à travers la France occupée pour rejoindre la frontière suisse.",
+            'subtitles': {
+                'fr': 'count:1',
+            },
+            'duration': 5587.034,
+            'tags': 'count:4',
+        },
+    }]
+    _GQL_QUERY_NAME = 'productByRootProductSlug'
+
+    @staticmethod
+    def _gql_args(groups):
+        args = 'rootProductSlug: "%s"' % groups[0]
+        if groups[1]:
+            args += ', seasonNumber: %s, episodeNumber: %s' % groups[1:]
+        return args
index 443f46e8a3537165d620c2db8863634e9f922ab6..52a4ddf32febc39a710f81dda5951584870e82c0 100644 (file)
@@ -4,7 +4,9 @@
 from .common import InfoExtractor
 from ..utils import (
     float_or_none,
+    int_or_none,
     smuggle_url,
+    strip_or_none,
 )
 
 
@@ -23,7 +25,8 @@ class TVAIE(InfoExtractor):
         'params': {
             # m3u8 download
             'skip_download': True,
-        }
+        },
+        'skip': 'HTTP Error 404: Not Found',
     }, {
         'url': 'https://video.tva.ca/details/_5596811470001',
         'only_matching': True,
@@ -32,26 +35,54 @@ class TVAIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        video_data = self._download_json(
-            'https://videos.tva.ca/proxy/item/_' + video_id, video_id, headers={
-                'Accept': 'application/json',
-            }, query={
-                'appId': '5955fc5f23eec60006c951f1',
-            })
-
-        def get_attribute(key):
-            for attribute in video_data.get('attributes', []):
-                if attribute.get('key') == key:
-                    return attribute.get('value')
-            return None
 
         return {
             '_type': 'url_transparent',
             'id': video_id,
-            'title': get_attribute('title'),
             'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['CA']}),
-            'description': get_attribute('description'),
-            'thumbnail': get_attribute('image-background') or get_attribute('image-landscape'),
-            'duration': float_or_none(get_attribute('video-duration'), 1000),
             'ie_key': 'BrightcoveNew',
         }
+
+
+class QubIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?qub\.ca/(?:[^/]+/)*[0-9a-z-]+-(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.qub.ca/tvaplus/tva/alerte-amber/saison-1/episode-01-1000036619',
+        'md5': '949490fd0e7aee11d0543777611fbd53',
+        'info_dict': {
+            'id': '6084352463001',
+            'ext': 'mp4',
+            'title': 'Épisode 01',
+            'uploader_id': '5481942443001',
+            'upload_date': '20190907',
+            'timestamp': 1567899756,
+            'description': 'md5:9c0d7fbb90939420c651fd977df90145',
+        },
+    }, {
+        'url': 'https://www.qub.ca/tele/video/lcn-ca-vous-regarde-rev-30s-ap369664-1009357943',
+        'only_matching': True,
+    }]
+    # reference_id also works with old account_id(5481942443001)
+    # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5813221784001/default_default/index.html?videoId=ref:%s'
+
+    def _real_extract(self, url):
+        entity_id = self._match_id(url)
+        entity = self._download_json(
+            'https://www.qub.ca/proxy/pfu/content-delivery-service/v1/entities',
+            entity_id, query={'id': entity_id})
+        video_id = entity['videoId']
+        episode = strip_or_none(entity.get('name'))
+
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'title': episode,
+            # 'url': self.BRIGHTCOVE_URL_TEMPLATE % entity['referenceId'],
+            'url': 'https://videos.tva.ca/details/_' + video_id,
+            'description': entity.get('longDescription'),
+            'duration': float_or_none(entity.get('durationMillis'), 1000),
+            'episode': episode,
+            'episode_number': int_or_none(entity.get('episodeNumber')),
+            # 'ie_key': 'BrightcoveNew',
+            'ie_key': TVAIE.ie_key(),
+        }
diff --git a/youtube_dlc/extractor/tver.py b/youtube_dlc/extractor/tver.py
new file mode 100644 (file)
index 0000000..931d4d6
--- /dev/null
@@ -0,0 +1,67 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    remove_start,
+    smuggle_url,
+    try_get,
+)
+
+
+class TVerIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?P<path>(?:corner|episode|feature)/(?P<id>f?\d+))'
+    # videos are only available for 7 days
+    _TESTS = [{
+        'url': 'https://tver.jp/corner/f0062178',
+        'only_matching': True,
+    }, {
+        'url': 'https://tver.jp/feature/f0062413',
+        'only_matching': True,
+    }, {
+        'url': 'https://tver.jp/episode/79622438',
+        'only_matching': True,
+    }]
+    _TOKEN = None
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
+
+    def _real_initialize(self):
+        self._TOKEN = self._download_json(
+            'https://tver.jp/api/access_token.php', None)['token']
+
+    def _real_extract(self, url):
+        path, video_id = re.match(self._VALID_URL, url).groups()
+        main = self._download_json(
+            'https://api.tver.jp/v4/' + path, video_id,
+            query={'token': self._TOKEN})['main']
+        p_id = main['publisher_id']
+        service = remove_start(main['service'], 'ts_')
+        info = {
+            '_type': 'url_transparent',
+            'description': try_get(main, lambda x: x['note'][0]['text'], compat_str),
+            'episode_number': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])),
+        }
+
+        if service == 'cx':
+            info.update({
+                'title': main.get('subtitle') or main['title'],
+                'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id),
+                'ie_key': 'FujiTVFODPlus7',
+            })
+        else:
+            r_id = main['reference_id']
+            if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'):
+                r_id = 'ref:' + r_id
+            bc_url = smuggle_url(
+                self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id),
+                {'geo_countries': ['JP']})
+            info.update({
+                'url': bc_url,
+                'ie_key': 'BrightcoveNew',
+            })
+
+        return info
index 3c2450dd0c8733d3a96a1a842c54505294b70ca6..0d858c02599670fa76a1760314c3ede4af672d2d 100644 (file)
     determine_ext,
     ExtractorError,
     int_or_none,
+    parse_duration,
     parse_iso8601,
     qualities,
     try_get,
     update_url_query,
     url_or_none,
+    urljoin,
 )
 
 
@@ -414,7 +416,7 @@ def _real_extract(self, url):
 
 
 class TVPlayHomeIE(InfoExtractor):
-    _VALID_URL = r'https?://tvplay\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/[^/]+/[^/?#&]+-(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:tv3?)?play\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/(?:[^/]+/)*[^/?#&]+-(?P<id>\d+)'
     _TESTS = [{
         'url': 'https://tvplay.tv3.lt/aferistai-n-7/aferistai-10047125/',
         'info_dict': {
@@ -433,80 +435,58 @@ class TVPlayHomeIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
-        'add_ie': [TVPlayIE.ie_key()],
     }, {
         'url': 'https://tvplay.skaties.lv/vinas-melo-labak/vinas-melo-labak-10280317/',
         'only_matching': True,
     }, {
         'url': 'https://tvplay.tv3.ee/cool-d-ga-mehhikosse/cool-d-ga-mehhikosse-10044354/',
         'only_matching': True,
+    }, {
+        'url': 'https://play.tv3.lt/aferistai-10047125',
+        'only_matching': True,
+    }, {
+        'url': 'https://tv3play.skaties.lv/vinas-melo-labak-10280317',
+        'only_matching': True,
+    }, {
+        'url': 'https://play.tv3.ee/cool-d-ga-mehhikosse-10044354',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, video_id)
-
-        video_id = self._search_regex(
-            r'data-asset-id\s*=\s*["\'](\d{5,})\b', webpage, 'video id')
-
-        if len(video_id) < 8:
-            return self.url_result(
-                'mtg:%s' % video_id, ie=TVPlayIE.ie_key(), video_id=video_id)
+        asset = self._download_json(
+            urljoin(url, '/sb/public/asset/' + video_id), video_id)
 
-        m3u8_url = self._search_regex(
-            r'data-file\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
-            'm3u8 url', group='url')
+        m3u8_url = asset['movie']['contentUrl']
+        video_id = asset['assetId']
+        asset_title = asset['title']
+        title = asset_title['title']
 
         formats = self._extract_m3u8_formats(
-            m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
-            m3u8_id='hls')
+            m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
         self._sort_formats(formats)
 
-        title = self._search_regex(
-            r'data-title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
-            'title', default=None, group='value') or self._html_search_meta(
-            'title', webpage, default=None) or self._og_search_title(
-            webpage)
-
-        description = self._html_search_meta(
-            'description', webpage,
-            default=None) or self._og_search_description(webpage)
-
-        thumbnail = self._search_regex(
-            r'data-image\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
-            'thumbnail', default=None, group='url') or self._html_search_meta(
-            'thumbnail', webpage, default=None) or self._og_search_thumbnail(
-            webpage)
-
-        duration = int_or_none(self._search_regex(
-            r'data-duration\s*=\s*["\'](\d+)', webpage, 'duration',
-            fatal=False))
+        thumbnails = None
+        image_url = asset.get('imageUrl')
+        if image_url:
+            thumbnails = [{
+                'url': urljoin(url, image_url),
+                'ext': 'jpg',
+            }]
 
-        season = self._search_regex(
-            (r'data-series-title\s*=\s*(["\'])[^/]+/(?P<value>(?:(?!\1).)+)\1',
-             r'\bseason\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
-            'season', default=None, group='value')
-        season_number = int_or_none(self._search_regex(
-            r'(\d+)(?:[.\s]+sezona|\s+HOOAEG)', season or '', 'season number',
-            default=None))
-        episode = self._search_regex(
-            (r'\bepisode\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
-             r'data-subtitle\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
-            'episode', default=None, group='value')
-        episode_number = int_or_none(self._search_regex(
-            r'(?:S[eē]rija|Osa)\s+(\d+)', episode or '', 'episode number',
-            default=None))
+        metadata = asset.get('metadata') or {}
 
         return {
             'id': video_id,
             'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'season': season,
-            'season_number': season_number,
-            'episode': episode,
-            'episode_number': episode_number,
+            'description': asset_title.get('summaryLong') or asset_title.get('summaryShort'),
+            'thumbnails': thumbnails,
+            'duration': parse_duration(asset_title.get('runTime')),
+            'series': asset.get('tvSeriesTitle'),
+            'season': asset.get('tvSeasonTitle'),
+            'season_number': int_or_none(metadata.get('seasonNumber')),
+            'episode': asset_title.get('titleBrief'),
+            'episode_number': int_or_none(metadata.get('episodeNumber')),
             'formats': formats,
         }
index 2dbe89f5bc5834a4204ab8638aafd3f9520c1de5..6596eef9f863c022bc7b9219d0882bec046b0fba 100644 (file)
@@ -1,11 +1,20 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from .common import InfoExtractor
-from ..utils import urlencode_postdata
-
 import re
 
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    float_or_none,
+    get_element_by_class,
+    get_element_by_id,
+    parse_duration,
+    str_to_int,
+    unified_timestamp,
+    urlencode_postdata,
+)
+
 
 class TwitCastingIE(InfoExtractor):
     _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/movie/(?P<id>\d+)'
@@ -17,8 +26,12 @@ class TwitCastingIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Live #2357609',
             'uploader_id': 'ivetesangalo',
-            'description': "Moi! I'm live on TwitCasting from my iPhone.",
+            'description': 'Twitter Oficial da cantora brasileira Ivete Sangalo.',
             'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20110822',
+            'timestamp': 1314010824,
+            'duration': 32,
+            'view_count': int,
         },
         'params': {
             'skip_download': True,
@@ -30,8 +43,12 @@ class TwitCastingIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Live playing something #3689740',
             'uploader_id': 'mttbernardini',
-            'description': "I'm live on TwitCasting from my iPad. password: abc (Santa Marinella/Lazio, Italia)",
+            'description': 'Salve, io sono Matto (ma con la e). Questa è la mia presentazione, in quanto sono letteralmente matto (nel senso di strano), con qualcosa in più.',
             'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20120212',
+            'timestamp': 1329028024,
+            'duration': 681,
+            'view_count': int,
         },
         'params': {
             'skip_download': True,
@@ -40,9 +57,7 @@ class TwitCastingIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        uploader_id = mobj.group('uploader_id')
+        uploader_id, video_id = re.match(self._VALID_URL, url).groups()
 
         video_password = self._downloader.params.get('videopassword')
         request_data = None
@@ -52,30 +67,45 @@ def _real_extract(self, url):
             })
         webpage = self._download_webpage(url, video_id, data=request_data)
 
-        title = self._html_search_regex(
-            r'(?s)<[^>]+id=["\']movietitle[^>]+>(.+?)</',
-            webpage, 'title', default=None) or self._html_search_meta(
-            'twitter:title', webpage, fatal=True)
+        title = clean_html(get_element_by_id(
+            'movietitle', webpage)) or self._html_search_meta(
+            ['og:title', 'twitter:title'], webpage, fatal=True)
 
+        video_js_data = {}
         m3u8_url = self._search_regex(
-            (r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
-             r'(["\'])(?P<url>http.+?\.m3u8.*?)\1'),
-            webpage, 'm3u8 url', group='url')
+            r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
+            webpage, 'm3u8 url', group='url', default=None)
+        if not m3u8_url:
+            video_js_data = self._parse_json(self._search_regex(
+                r"data-movie-playlist='(\[[^']+\])'",
+                webpage, 'movie playlist'), video_id)[0]
+            m3u8_url = video_js_data['source']['url']
 
+        # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol
         formats = self._extract_m3u8_formats(
-            m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native',
-            m3u8_id='hls')
+            m3u8_url, video_id, 'mp4', m3u8_id='hls')
 
-        thumbnail = self._og_search_thumbnail(webpage)
-        description = self._og_search_description(
-            webpage, default=None) or self._html_search_meta(
-            'twitter:description', webpage)
+        thumbnail = video_js_data.get('thumbnailUrl') or self._og_search_thumbnail(webpage)
+        description = clean_html(get_element_by_id(
+            'authorcomment', webpage)) or self._html_search_meta(
+            ['description', 'og:description', 'twitter:description'], webpage)
+        duration = float_or_none(video_js_data.get(
+            'duration'), 1000) or parse_duration(clean_html(
+                get_element_by_class('tw-player-duration-time', webpage)))
+        view_count = str_to_int(self._search_regex(
+            r'Total\s*:\s*([\d,]+)\s*Views', webpage, 'views', None))
+        timestamp = unified_timestamp(self._search_regex(
+            r'data-toggle="true"[^>]+datetime="([^"]+)"',
+            webpage, 'datetime', None))
 
         return {
             'id': video_id,
             'title': title,
             'description': description,
             'thumbnail': thumbnail,
+            'timestamp': timestamp,
             'uploader_id': uploader_id,
+            'duration': duration,
+            'view_count': view_count,
             'formats': formats,
         }
index 2137502a12cacac030b71ff7874d9c935eb2847f..f28fd514db66217a99800657ae990829d240d71b 100644 (file)
@@ -5,10 +5,9 @@
 
 
 class UKTVPlayIE(InfoExtractor):
-    _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/.+?\?.*?\bvideo=(?P<id>\d+)'
-    _TEST = {
+    _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/(?:.+?\?.*?\bvideo=|([^/]+/)*watch-online/)(?P<id>\d+)'
+    _TESTS = [{
         'url': 'https://uktvplay.uktv.co.uk/shows/world-at-war/c/200/watch-online/?video=2117008346001',
-        'md5': '',
         'info_dict': {
             'id': '2117008346001',
             'ext': 'mp4',
@@ -23,7 +22,11 @@ class UKTVPlayIE(InfoExtractor):
             'skip_download': True,
         },
         'expected_warnings': ['Failed to download MPD manifest']
-    }
+    }, {
+        'url': 'https://uktvplay.uktv.co.uk/shows/africa/watch-online/5983349675001',
+        'only_matching': True,
+    }]
+    # BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/1242911124001/OrCyvJ2gyL_default/index.html?videoId=%s'
     BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1242911124001/H1xnMOqP_default/index.html?videoId=%s'
 
     def _real_extract(self, url):
index a03614cc10918301dbd3196de2123b0f22b2a8b8..ab2c15cdec154bf7d760c588d114eed106221b60 100644 (file)
@@ -1,10 +1,9 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
 import random
+import re
 import string
-import struct
 
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
     mimetype2ext,
     parse_codecs,
+    update_url_query,
     xpath_element,
     xpath_text,
 )
 from ..compat import (
     compat_b64decode,
     compat_ord,
-    compat_parse_qs,
+    compat_struct_pack,
 )
 
 
@@ -28,7 +28,7 @@ class VideaIE(InfoExtractor):
                         videa(?:kid)?\.hu/
                         (?:
                             videok/(?:[^/]+/)*[^?#&]+-|
-                            player\?.*?\bv=|
+                            (?:videojs_)?player\?.*?\bv=|
                             player/v/
                         )
                         (?P<id>[^?#&]+)
@@ -62,6 +62,7 @@ class VideaIE(InfoExtractor):
         'url': 'https://videakid.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1',
         'only_matching': True,
     }]
+    _STATIC_SECRET = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p'
 
     @staticmethod
     def _extract_urls(webpage):
@@ -69,75 +70,84 @@ def _extract_urls(webpage):
             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1',
             webpage)]
 
-    def rc4(self, ciphertext, key):
+    @staticmethod
+    def rc4(cipher_text, key):
         res = b''
 
-        keyLen = len(key)
+        key_len = len(key)
         S = list(range(256))
 
         j = 0
         for i in range(256):
-            j = (j + S[i] + ord(key[i % keyLen])) % 256
+            j = (j + S[i] + ord(key[i % key_len])) % 256
             S[i], S[j] = S[j], S[i]
 
         i = 0
         j = 0
-        for m in range(len(ciphertext)):
+        for m in range(len(cipher_text)):
             i = (i + 1) % 256
             j = (j + S[i]) % 256
             S[i], S[j] = S[j], S[i]
             k = S[(S[i] + S[j]) % 256]
-            res += struct.pack("B", k ^ compat_ord(ciphertext[m]))
+            res += compat_struct_pack('B', k ^ compat_ord(cipher_text[m]))
 
-        return res
+        return res.decode()
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id, fatal=True)
-        error = self._search_regex(r'<p class="error-text">([^<]+)</p>', webpage, 'error', default=None)
-        if error:
-            raise ExtractorError(error, expected=True)
-
-        video_src_params_raw = self._search_regex(r'<iframe[^>]+id="videa_player_iframe"[^>]+src="/player\?([^"]+)"', webpage, 'video_src_params')
-        video_src_params = compat_parse_qs(video_src_params_raw)
-        player_page = self._download_webpage("https://videa.hu/videojs_player?%s" % video_src_params_raw, video_id, fatal=True)
-        nonce = self._search_regex(r'_xt\s*=\s*"([^"]+)"', player_page, 'nonce')
-        random_seed = ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) for _ in range(8))
-        static_secret = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p'
+        query = {'v': video_id}
+        player_page = self._download_webpage(
+            'https://videa.hu/player', video_id, query=query)
+
+        nonce = self._search_regex(
+            r'_xt\s*=\s*"([^"]+)"', player_page, 'nonce')
         l = nonce[:32]
         s = nonce[32:]
         result = ''
         for i in range(0, 32):
-            result += s[i - (static_secret.index(l[i]) - 31)]
-
-        video_src_params['_s'] = random_seed
-        video_src_params['_t'] = result[:16]
-        encryption_key_stem = result[16:] + random_seed
-
-        [b64_info, handle] = self._download_webpage_handle(
-            'http://videa.hu/videaplayer_get_xml.php', video_id,
-            query=video_src_params, fatal=True)
-
-        encrypted_info = compat_b64decode(b64_info)
-        key = encryption_key_stem + handle.info()['x-videa-xs']
-        info_str = self.rc4(encrypted_info, key).decode('utf8')
-        info = self._parse_xml(info_str, video_id)
-
-        video = xpath_element(info, './/video', 'video', fatal=True)
-        sources = xpath_element(info, './/video_sources', 'sources', fatal=True)
-        hash_values = xpath_element(info, './/hash_values', 'hash_values', fatal=True)
+            result += s[i - (self._STATIC_SECRET.index(l[i]) - 31)]
+
+        random_seed = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(8))
+        query['_s'] = random_seed
+        query['_t'] = result[:16]
+
+        b64_info, handle = self._download_webpage_handle(
+            'http://videa.hu/videaplayer_get_xml.php', video_id, query=query)
+        if b64_info.startswith('<?xml'):
+            info = self._parse_xml(b64_info, video_id)
+        else:
+            key = result[16:] + random_seed + handle.headers['x-videa-xs']
+            info = self._parse_xml(self.rc4(
+                compat_b64decode(b64_info), key), video_id)
+
+        video = xpath_element(info, './video', 'video')
+        if not video:
+            raise ExtractorError(xpath_element(
+                info, './error', fatal=True), expected=True)
+        sources = xpath_element(
+            info, './video_sources', 'sources', fatal=True)
+        hash_values = xpath_element(
+            info, './hash_values', 'hash values', fatal=True)
 
         title = xpath_text(video, './title', fatal=True)
 
         formats = []
         for source in sources.findall('./video_source'):
             source_url = source.text
-            if not source_url:
+            source_name = source.get('name')
+            source_exp = source.get('exp')
+            if not (source_url and source_name and source_exp):
                 continue
-            source_url += '?md5=%s&expires=%s' % (hash_values.find('hash_value_%s' % source.get('name')).text, source.get('exp'))
+            hash_value = xpath_text(hash_values, 'hash_value_' + source_name)
+            if not hash_value:
+                continue
+            source_url = update_url_query(source_url, {
+                'md5': hash_value,
+                'expires': source_exp,
+            })
             f = parse_codecs(source.get('codecs'))
             f.update({
-                'url': source_url,
+                'url': self._proto_relative_url(source_url),
                 'ext': mimetype2ext(source.get('mimetype')) or 'mp4',
                 'format_id': source.get('name'),
                 'width': int_or_none(source.get('width')),
@@ -146,8 +156,7 @@ def _real_extract(self, url):
             formats.append(f)
         self._sort_formats(formats)
 
-        thumbnail = xpath_text(video, './poster_src')
-        duration = int_or_none(xpath_text(video, './duration'))
+        thumbnail = self._proto_relative_url(xpath_text(video, './poster_src'))
 
         age_limit = None
         is_adult = xpath_text(video, './is_adult_content', default=None)
@@ -158,7 +167,7 @@ def _real_extract(self, url):
             'id': video_id,
             'title': title,
             'thumbnail': thumbnail,
-            'duration': duration,
+            'duration': int_or_none(xpath_text(video, './duration')),
             'age_limit': age_limit,
             'formats': formats,
         }
index e3eda3327f48465a307390e23575d8754693737c..e0c10aa5b011b9695ee0150db1f328a9ef80d8b7 100644 (file)
@@ -4,30 +4,50 @@
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import (
+    compat_parse_qs,
+    compat_str,
+    compat_urllib_parse_urlparse,
+)
 from ..utils import (
+    ExtractorError,
     int_or_none,
-    orderedSet,
-    parse_duration,
-    str_or_none,
-    unified_strdate,
-    url_or_none,
-    xpath_element,
-    xpath_text,
 )
 
 
+class VideomoreBaseIE(InfoExtractor):
+    _API_BASE_URL = 'https://more.tv/api/v3/web/'
+    _VALID_URL_BASE = r'https?://(?:videomore\.ru|more\.tv)/'
+
+    def _download_page_data(self, display_id):
+        return self._download_json(
+            self._API_BASE_URL + 'PageData', display_id, query={
+                'url': '/' + display_id,
+            })['attributes']['response']['data']
+
+    def _track_url_result(self, track):
+        track_vod = track['trackVod']
+        video_url = track_vod.get('playerLink') or track_vod['link']
+        return self.url_result(
+            video_url, VideomoreIE.ie_key(), track_vod.get('hubId'))
+
+
 class VideomoreIE(InfoExtractor):
     IE_NAME = 'videomore'
     _VALID_URL = r'''(?x)
                     videomore:(?P<sid>\d+)$|
-                    https?://(?:player\.)?videomore\.ru/
+                    https?://
                         (?:
+                            videomore\.ru/
                             (?:
                                 embed|
                                 [^/]+/[^/]+
                             )/|
-                            [^/]*\?.*?\btrack_id=
+                            (?:
+                                (?:player\.)?videomore\.ru|
+                                siren\.more\.tv/player
+                            )/[^/]*\?.*?\btrack_id=|
+                            odysseus\.more.tv/player/(?P<partner_id>\d+)/
                         )
                         (?P<id>\d+)
                         (?:[/?#&]|\.(?:xml|json)|$)
@@ -47,18 +67,19 @@ class VideomoreIE(InfoExtractor):
             'comment_count': int,
             'age_limit': 16,
         },
+        'skip': 'The video is not available for viewing.',
     }, {
         'url': 'http://videomore.ru/embed/259974',
         'info_dict': {
             'id': '259974',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Молодежка 2 сезон 40 серия',
             'series': 'Молодежка',
+            'season': '2 сезон',
             'episode': '40 серия',
             'thumbnail': r're:^https?://.*\.jpg',
-            'duration': 2809,
+            'duration': 2789,
             'view_count': int,
-            'comment_count': int,
             'age_limit': 16,
         },
         'params': {
@@ -79,6 +100,7 @@ class VideomoreIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
+        'skip': 'The video is not available for viewing.',
     }, {
         'url': 'http://videomore.ru/elki_3?track_id=364623',
         'only_matching': True,
@@ -100,7 +122,14 @@ class VideomoreIE(InfoExtractor):
     }, {
         'url': 'https://player.videomore.ru/?partner_id=97&track_id=736234&autoplay=0&userToken=',
         'only_matching': True,
+    }, {
+        'url': 'https://odysseus.more.tv/player/1788/352317',
+        'only_matching': True,
+    }, {
+        'url': 'https://siren.more.tv/player/config?track_id=352317&partner_id=1788&user_token=',
+        'only_matching': True,
     }]
+    _GEO_BYPASS = False
 
     @staticmethod
     def _extract_url(webpage):
@@ -118,46 +147,73 @@ def _extract_url(webpage):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('sid') or mobj.group('id')
-
-        video = self._download_xml(
-            'http://videomore.ru/video/tracks/%s.xml' % video_id,
-            video_id, 'Downloading video XML')
-
-        item = xpath_element(video, './/playlist/item', fatal=True)
-
-        title = xpath_text(
-            item, ('./title', './episode_name'), 'title', fatal=True)
-
-        video_url = xpath_text(item, './video_url', 'video url', fatal=True)
-        formats = self._extract_f4m_formats(video_url, video_id, f4m_id='hds')
+        partner_id = mobj.group('partner_id') or compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('partner_id', [None])[0] or '97'
+
+        item = self._download_json(
+            'https://siren.more.tv/player/config', video_id, query={
+                'partner_id': partner_id,
+                'track_id': video_id,
+            })['data']['playlist']['items'][0]
+
+        title = item.get('title')
+        series = item.get('project_name')
+        season = item.get('season_name')
+        episode = item.get('episode_name')
+        if not title:
+            title = []
+            for v in (series, season, episode):
+                if v:
+                    title.append(v)
+            title = ' '.join(title)
+
+        streams = item.get('streams') or []
+        for protocol in ('DASH', 'HLS'):
+            stream_url = item.get(protocol.lower() + '_url')
+            if stream_url:
+                streams.append({'protocol': protocol, 'url': stream_url})
+
+        formats = []
+        for stream in streams:
+            stream_url = stream.get('url')
+            if not stream_url:
+                continue
+            protocol = stream.get('protocol')
+            if protocol == 'DASH':
+                formats.extend(self._extract_mpd_formats(
+                    stream_url, video_id, mpd_id='dash', fatal=False))
+            elif protocol == 'HLS':
+                formats.extend(self._extract_m3u8_formats(
+                    stream_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif protocol == 'MSS':
+                formats.extend(self._extract_ism_formats(
+                    stream_url, video_id, ism_id='mss', fatal=False))
+
+        if not formats:
+            error = item.get('error')
+            if error:
+                if error in ('Данное видео недоступно для просмотра на территории этой страны', 'Данное видео доступно для просмотра только на территории России'):
+                    self.raise_geo_restricted(countries=['RU'])
+                raise ExtractorError(error, expected=True)
         self._sort_formats(formats)
 
-        thumbnail = xpath_text(item, './thumbnail_url')
-        duration = int_or_none(xpath_text(item, './duration'))
-        view_count = int_or_none(xpath_text(item, './views'))
-        comment_count = int_or_none(xpath_text(item, './count_comments'))
-        age_limit = int_or_none(xpath_text(item, './min_age'))
-
-        series = xpath_text(item, './project_name')
-        episode = xpath_text(item, './episode_name')
-
         return {
             'id': video_id,
             'title': title,
             'series': series,
+            'season': season,
             'episode': episode,
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'view_count': view_count,
-            'comment_count': comment_count,
-            'age_limit': age_limit,
+            'thumbnail': item.get('thumbnail_url'),
+            'duration': int_or_none(item.get('duration')),
+            'view_count': int_or_none(item.get('views')),
+            'age_limit': int_or_none(item.get('min_age')),
             'formats': formats,
         }
 
 
-class VideomoreVideoIE(InfoExtractor):
+class VideomoreVideoIE(VideomoreBaseIE):
     IE_NAME = 'videomore:video'
-    _VALID_URL = r'https?://videomore\.ru/(?:(?:[^/]+/){2})?(?P<id>[^/?#&]+)(?:/*|[?#&].*?)$'
+    _VALID_URL = VideomoreBaseIE._VALID_URL_BASE + r'(?P<id>(?:(?:[^/]+/){2})?[^/?#&]+)(?:/*|[?#&].*?)$'
     _TESTS = [{
         # single video with og:video:iframe
         'url': 'http://videomore.ru/elki_3',
@@ -174,10 +230,25 @@ class VideomoreVideoIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
+        'skip': 'Requires logging in',
     }, {
         # season single series with og:video:iframe
         'url': 'http://videomore.ru/poslednii_ment/1_sezon/14_seriya',
-        'only_matching': True,
+        'info_dict': {
+            'id': '352317',
+            'ext': 'mp4',
+            'title': 'Последний мент 1 сезон 14 серия',
+            'series': 'Последний мент',
+            'season': '1 сезон',
+            'episode': '14 серия',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 2464,
+            'age_limit': 16,
+            'view_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
     }, {
         'url': 'http://videomore.ru/sejchas_v_seti/serii_221-240/226_vypusk',
         'only_matching': True,
@@ -197,9 +268,13 @@ class VideomoreVideoIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
+        'skip': 'redirects to https://more.tv/'
     }, {
         'url': 'https://videomore.ru/molodezhka/6_sezon/29_seriya?utm_so',
         'only_matching': True,
+    }, {
+        'url': 'https://more.tv/poslednii_ment/1_sezon/14_seriya',
+        'only_matching': True,
     }]
 
     @classmethod
@@ -208,38 +283,25 @@ def suitable(cls, url):
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, display_id)
-
-        video_url = self._og_search_property(
-            'video:iframe', webpage, 'video url', default=None)
-
-        if not video_url:
-            video_id = self._search_regex(
-                (r'config\s*:\s*["\']https?://videomore\.ru/video/tracks/(\d+)\.xml',
-                 r'track-id=["\'](\d+)',
-                 r'xcnt_product_id\s*=\s*(\d+)'), webpage, 'video id')
-            video_url = 'videomore:%s' % video_id
-        else:
-            video_id = None
-
-        return self.url_result(
-            video_url, ie=VideomoreIE.ie_key(), video_id=video_id)
+        return self._track_url_result(self._download_page_data(display_id))
 
 
-class VideomoreSeasonIE(InfoExtractor):
+class VideomoreSeasonIE(VideomoreBaseIE):
     IE_NAME = 'videomore:season'
-    _VALID_URL = r'https?://videomore\.ru/(?!embed)(?P<id>[^/]+/[^/?#&]+)(?:/*|[?#&].*?)$'
+    _VALID_URL = VideomoreBaseIE._VALID_URL_BASE + r'(?!embed)(?P<id>[^/]+/[^/?#&]+)(?:/*|[?#&].*?)$'
     _TESTS = [{
-        'url': 'http://videomore.ru/molodezhka/sezon_promo',
+        'url': 'http://videomore.ru/molodezhka/film_o_filme',
         'info_dict': {
-            'id': 'molodezhka/sezon_promo',
-            'title': 'Ð\9cолодежка Ð\9fÑ\80омо',
+            'id': 'molodezhka/film_o_filme',
+            'title': 'ФилÑ\8cм Ð¾ Ñ\84илÑ\8cме',
         },
-        'playlist_mincount': 12,
+        'playlist_mincount': 3,
     }, {
         'url': 'http://videomore.ru/molodezhka/sezon_promo?utm_so',
         'only_matching': True,
+    }, {
+        'url': 'https://more.tv/molodezhka/film_o_filme',
+        'only_matching': True,
     }]
 
     @classmethod
@@ -249,59 +311,12 @@ def suitable(cls, url):
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, display_id)
-
-        title = self._og_search_title(webpage)
-
-        data = self._parse_json(
-            self._html_search_regex(
-                r'\bclass=["\']seasons-tracks["\'][^>]+\bdata-custom-data=(["\'])(?P<value>{.+?})\1',
-                webpage, 'data', default='{}', group='value'),
-            display_id, fatal=False)
-
+        season = self._download_page_data(display_id)
+        season_id = compat_str(season['id'])
+        tracks = self._download_json(
+            self._API_BASE_URL + 'seasons/%s/tracks' % season_id,
+            season_id)['data']
         entries = []
-
-        if data:
-            episodes = data.get('episodes')
-            if isinstance(episodes, list):
-                for ep in episodes:
-                    if not isinstance(ep, dict):
-                        continue
-                    ep_id = int_or_none(ep.get('id'))
-                    ep_url = url_or_none(ep.get('url'))
-                    if ep_id:
-                        e = {
-                            'url': 'videomore:%s' % ep_id,
-                            'id': compat_str(ep_id),
-                        }
-                    elif ep_url:
-                        e = {'url': ep_url}
-                    else:
-                        continue
-                    e.update({
-                        '_type': 'url',
-                        'ie_key': VideomoreIE.ie_key(),
-                        'title': str_or_none(ep.get('title')),
-                        'thumbnail': url_or_none(ep.get('image')),
-                        'duration': parse_duration(ep.get('duration')),
-                        'episode_number': int_or_none(ep.get('number')),
-                        'upload_date': unified_strdate(ep.get('date')),
-                    })
-                    entries.append(e)
-
-        if not entries:
-            entries = [
-                self.url_result(
-                    'videomore:%s' % video_id, ie=VideomoreIE.ie_key(),
-                    video_id=video_id)
-                for video_id in orderedSet(re.findall(
-                    r':(?:id|key)=["\'](\d+)["\']', webpage))]
-
-        if not entries:
-            entries = [
-                self.url_result(item) for item in re.findall(
-                    r'<a[^>]+href="((?:https?:)?//videomore\.ru/%s/[^/]+)"[^>]+class="widget-item-desc"'
-                    % display_id, webpage)]
-
-        return self.playlist_result(entries, display_id, title)
+        for track in tracks:
+            entries.append(self._track_url_result(track))
+        return self.playlist_result(entries, display_id, season.get('title'))
index 09da4338d9199d5ffaf122719227a8dda37c005e..fd1c305b1ee81b3b5b181e564a194d4a72dd8814 100644 (file)
@@ -63,14 +63,14 @@ def _prepare_call(self, path, timestamp=None, post_data=None):
 
     def _call_api(self, path, video_id, note, timestamp=None, post_data=None):
         resp = self._download_json(
-            self._prepare_call(path, timestamp, post_data), video_id, note, headers={'x-viki-app-ver': '2.2.5.1428709186'}, expected_status=[200, 400, 404])
+            self._prepare_call(path, timestamp, post_data), video_id, note)
 
         error = resp.get('error')
         if error:
             if error == 'invalid timestamp':
                 resp = self._download_json(
                     self._prepare_call(path, int(resp['current_timestamp']), post_data),
-                    video_id, '%s (retry)' % note, headers={'x-viki-app-ver': '2.2.5.1428709186'}, expected_status=[200, 400, 404])
+                    video_id, '%s (retry)' % note)
                 error = resp.get('error')
             if error:
                 self._raise_error(resp['error'])
@@ -263,7 +263,7 @@ def _real_extract(self, url):
             # New way to fetch subtitles
             new_video = self._download_json(
                 'https://www.viki.com/api/videos/%s' % video_id, video_id,
-                'Downloading new video JSON to get subtitles', headers={'x-viki-app-ver': '2.2.5.1428709186'}, expected_status=[200, 400, 404])
+                'Downloading new video JSON to get subtitles', fatal=False)
             for sub in new_video.get('streamSubtitles').get('dash'):
                 subtitles[sub.get('srclang')] = [{
                     'ext': 'vtt',
index 2fc42bbae746f08601cbdaa093891b29a6354aeb..299d99f6f9767d2508e096ee4edef34c4eaeb1dc 100644 (file)
@@ -1120,6 +1120,12 @@ class VHXEmbedIE(VimeoBaseInfoExtractor):
     IE_NAME = 'vhx:embed'
     _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P<id>\d+)'
 
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+src="(https?://embed\.vhx\.tv/videos/\d+[^"]*)"', webpage)
+        return unescapeHTML(mobj.group(1)) if mobj else None
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
@@ -1128,5 +1134,6 @@ def _real_extract(self, url):
             'ott data'), video_id, js_to_json)['config_url']
         config = self._download_json(config_url, video_id)
         info = self._parse_config(config, video_id)
+        info['id'] = video_id
         self._vimeo_sort_formats(info['formats'])
         return info
index c07550810b6ecc0c2096471cbe2a0786495b609e..96b4f665ed50519dadd0b97d6ccdf6801f77fea0 100644 (file)
@@ -155,6 +155,7 @@ def get_common_fields():
                     'old/v3/live/%s/playInfo',
                     video_id)['result']['adaptiveStreamUrl']
                 formats = self._extract_m3u8_formats(stream_url, video_id, 'mp4')
+                self._sort_formats(formats)
                 info = get_common_fields()
                 info.update({
                     'title': self._live_title(video['title']),
index 6906cd2aba72b018f850e5d6d377f4c93950d1fb..f4cae7fe9c45e9670459ea16ad0a26662bee3efb 100644 (file)
@@ -12,7 +12,8 @@
 
 
 class VVVVIDIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?vvvvid\.it/(?:#!)?(?:show|anime|film|series)/(?P<show_id>\d+)/[^/]+/(?P<season_id>\d+)/(?P<id>[0-9]+)'
+    _VALID_URL_BASE = r'https?://(?:www\.)?vvvvid\.it/(?:#!)?(?:show|anime|film|series)/'
+    _VALID_URL = r'%s(?P<show_id>\d+)/[^/]+/(?P<season_id>\d+)/(?P<id>[0-9]+)' % _VALID_URL_BASE
     _TESTS = [{
         # video_type == 'video/vvvvid'
         'url': 'https://www.vvvvid.it/#!show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048/ping-pong',
@@ -21,6 +22,15 @@ class VVVVIDIE(InfoExtractor):
             'id': '489048',
             'ext': 'mp4',
             'title': 'Ping Pong',
+            'duration': 239,
+            'series': '"Perché dovrei guardarlo?" di Dario Moccia',
+            'season_id': '437',
+            'episode': 'Ping Pong',
+            'episode_number': 1,
+            'episode_id': '3334',
+            'view_count': int,
+            'like_count': int,
+            'repost_count': int,
         },
         'params': {
             'skip_download': True,
@@ -37,6 +47,9 @@ class VVVVIDIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
+    }, {
+        'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048',
+        'only_matching': True
     }]
     _conn_id = None
 
@@ -45,20 +58,35 @@ def _real_initialize(self):
             'https://www.vvvvid.it/user/login',
             None, headers=self.geo_verification_headers())['data']['conn_id']
 
-    def _real_extract(self, url):
-        show_id, season_id, video_id = re.match(self._VALID_URL, url).groups()
+    def _download_info(self, show_id, path, video_id, fatal=True):
         response = self._download_json(
-            'https://www.vvvvid.it/vvvvid/ondemand/%s/season/%s' % (show_id, season_id),
+            'https://www.vvvvid.it/vvvvid/ondemand/%s/%s' % (show_id, path),
             video_id, headers=self.geo_verification_headers(), query={
                 'conn_id': self._conn_id,
-            })
-        if response['result'] == 'error':
+            }, fatal=fatal)
+        if not (response or fatal):
+            return
+        if response.get('result') == 'error':
             raise ExtractorError('%s said: %s' % (
                 self.IE_NAME, response['message']), expected=True)
+        return response['data']
+
+    def _extract_common_video_info(self, video_data):
+        return {
+            'thumbnail': video_data.get('thumbnail'),
+            'episode_id': str_or_none(video_data.get('id')),
+        }
+
+    def _real_extract(self, url):
+        show_id, season_id, video_id = re.match(self._VALID_URL, url).groups()
+
+        response = self._download_info(
+            show_id, 'season/%s' % season_id, video_id)
 
         vid = int(video_id)
         video_data = list(filter(
-            lambda episode: episode.get('video_id') == vid, response['data']))[0]
+            lambda episode: episode.get('video_id') == vid, response))[0]
+        title = video_data['title']
         formats = []
 
         # vvvvid embed_info decryption algorithm is reverse engineered from function $ds(h) at vvvvid.js
@@ -115,6 +143,17 @@ def f(m):
 
             return d
 
+        info = {}
+
+        def metadata_from_url(r_url):
+            if not info and r_url:
+                mobj = re.search(r'_(?:S(\d+))?Ep(\d+)', r_url)
+                if mobj:
+                    info['episode_number'] = int(mobj.group(2))
+                    season_number = mobj.group(1)
+                    if season_number:
+                        info['season_number'] = int(season_number)
+
         for quality in ('_sd', ''):
             embed_code = video_data.get('embed_info' + quality)
             if not embed_code:
@@ -122,7 +161,6 @@ def f(m):
             embed_code = ds(embed_code)
             video_type = video_data.get('video_type')
             if video_type in ('video/rcs', 'video/kenc'):
-                embed_code = re.sub(r'https?://([^/]+)/z/', r'https://\1/i/', embed_code).replace('/manifest.f4m', '/master.m3u8')
                 if video_type == 'video/kenc':
                     kenc = self._download_json(
                         'https://www.vvvvid.it/kenc', video_id, query={
@@ -133,26 +171,75 @@ def f(m):
                     kenc_message = kenc.get('message')
                     if kenc_message:
                         embed_code += '?' + ds(kenc_message)
-                formats.extend(self._extract_m3u8_formats(
-                    embed_code, video_id, 'mp4',
-                    m3u8_id='hls', fatal=False))
+                formats.extend(self._extract_akamai_formats(embed_code, video_id))
             else:
                 formats.extend(self._extract_wowza_formats(
                     'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id))
+            metadata_from_url(embed_code)
+
         self._sort_formats(formats)
 
-        return {
+        metadata_from_url(video_data.get('thumbnail'))
+        info.update(self._extract_common_video_info(video_data))
+        info.update({
             'id': video_id,
-            'title': video_data['title'],
+            'title': title,
             'formats': formats,
-            'thumbnail': video_data.get('thumbnail'),
             'duration': int_or_none(video_data.get('length')),
             'series': video_data.get('show_title'),
             'season_id': season_id,
-            'season_number': video_data.get('season_number'),
-            'episode_id': str_or_none(video_data.get('id')),
-            'episode_number': int_or_none(video_data.get('number')),
-            'episode_title': video_data['title'],
+            'episode': title,
             'view_count': int_or_none(video_data.get('views')),
             'like_count': int_or_none(video_data.get('video_likes')),
-        }
+            'repost_count': int_or_none(video_data.get('video_shares')),
+        })
+        return info
+
+
+class VVVVIDShowIE(VVVVIDIE):
+    _VALID_URL = r'(?P<base_url>%s(?P<id>\d+)(?:/(?P<show_title>[^/?&#]+))?)/?(?:[?#&]|$)' % VVVVIDIE._VALID_URL_BASE
+    _TESTS = [{
+        'url': 'https://www.vvvvid.it/show/156/psyco-pass',
+        'info_dict': {
+            'id': '156',
+            'title': 'Psycho-Pass',
+            'description': 'md5:94d572c0bd85894b193b8aebc9a3a806',
+        },
+        'playlist_count': 46,
+    }, {
+        'url': 'https://www.vvvvid.it/show/156',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        base_url, show_id, show_title = re.match(self._VALID_URL, url).groups()
+
+        seasons = self._download_info(
+            show_id, 'seasons/', show_title)
+
+        show_info = self._download_info(
+            show_id, 'info/', show_title, fatal=False)
+
+        entries = []
+        for season in (seasons or []):
+            episodes = season.get('episodes') or []
+            for episode in episodes:
+                if episode.get('playable') is False:
+                    continue
+                season_id = str_or_none(episode.get('season_id'))
+                video_id = str_or_none(episode.get('video_id'))
+                if not (season_id and video_id):
+                    continue
+                info = self._extract_common_video_info(episode)
+                info.update({
+                    '_type': 'url',
+                    'ie_key': VVVVIDIE.ie_key(),
+                    'url': '/'.join([base_url, season_id, video_id]),
+                    'title': episode.get('title'),
+                    'description': episode.get('description'),
+                    'season_id': season_id,
+                })
+                entries.append(info)
+
+        return self.playlist_result(
+            entries, show_id, show_info.get('title'), show_info.get('description'))
index 625d0a1cc14a52604f46e264a0f93342056fd9df..8afb1af831aec44831d07d61adae269d796f10c9 100644 (file)
@@ -4,17 +4,13 @@
 import re
 
 from .common import InfoExtractor
-from ..utils import (
-    int_or_none,
-    strip_jsonp,
-)
 
 
 class WashingtonPostIE(InfoExtractor):
     IE_NAME = 'washingtonpost'
-    _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+    _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/(?:video|posttv)/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
     _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
-    _TEST = {
+    _TESTS = [{
         'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
         'md5': '6f537e1334b714eb15f9563bd4b9cdfa',
         'info_dict': {
@@ -23,10 +19,15 @@ class WashingtonPostIE(InfoExtractor):
             'title': 'Egypt finds belongings, debris from plane crash',
             'description': 'md5:a17ceee432f215a5371388c1f680bd86',
             'upload_date': '20160520',
-            'uploader': 'Reuters',
-            'timestamp': 1463778452,
+            'timestamp': 1463775187,
         },
-    }
+    }, {
+        'url': 'https://www.washingtonpost.com/video/world/egypt-finds-belongings-debris-from-plane-crash/2016/05/20/480ba4ee-1ec7-11e6-82c2-a7dcb313287d_video.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.washingtonpost.com/posttv/world/iraq-to-track-down-antiquities-after-islamic-state-museum-rampage/2015/02/28/7c57e916-bf86-11e4-9dfb-03366e719af8_video.html',
+        'only_matching': True,
+    }]
 
     @classmethod
     def _extract_urls(cls, webpage):
@@ -35,73 +36,8 @@ def _extract_urls(cls, webpage):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        video_data = self._download_json(
-            'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % video_id,
-            video_id, transform_source=strip_jsonp)[0]['contentConfig']
-        title = video_data['title']
-
-        urls = []
-        formats = []
-        for s in video_data.get('streams', []):
-            s_url = s.get('url')
-            if not s_url or s_url in urls:
-                continue
-            urls.append(s_url)
-            video_type = s.get('type')
-            if video_type == 'smil':
-                continue
-            elif video_type in ('ts', 'hls') and ('_master.m3u8' in s_url or '_mobile.m3u8' in s_url):
-                m3u8_formats = self._extract_m3u8_formats(
-                    s_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
-                for m3u8_format in m3u8_formats:
-                    width = m3u8_format.get('width')
-                    if not width:
-                        continue
-                    vbr = self._search_regex(
-                        r'%d_%d_(\d+)' % (width, m3u8_format['height']), m3u8_format['url'], 'vbr', default=None)
-                    if vbr:
-                        m3u8_format.update({
-                            'vbr': int_or_none(vbr),
-                        })
-                formats.extend(m3u8_formats)
-            else:
-                width = int_or_none(s.get('width'))
-                vbr = int_or_none(s.get('bitrate'))
-                has_width = width != 0
-                formats.append({
-                    'format_id': (
-                        '%s-%d-%d' % (video_type, width, vbr)
-                        if width
-                        else video_type),
-                    'vbr': vbr if has_width else None,
-                    'width': width,
-                    'height': int_or_none(s.get('height')),
-                    'acodec': s.get('audioCodec'),
-                    'vcodec': s.get('videoCodec') if has_width else 'none',
-                    'filesize': int_or_none(s.get('fileSize')),
-                    'url': s_url,
-                    'ext': 'mp4',
-                    'protocol': 'm3u8_native' if video_type in ('ts', 'hls') else None,
-                })
-        source_media_url = video_data.get('sourceMediaURL')
-        if source_media_url:
-            formats.append({
-                'format_id': 'source_media',
-                'url': source_media_url,
-            })
-        self._sort_formats(
-            formats, ('width', 'height', 'vbr', 'filesize', 'tbr', 'format_id'))
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': video_data.get('blurb'),
-            'uploader': video_data.get('credits', {}).get('source'),
-            'formats': formats,
-            'duration': int_or_none(video_data.get('videoDuration'), 100),
-            'timestamp': int_or_none(
-                video_data.get('dateConfig', {}).get('dateFirstPublished'), 1000),
-        }
+        return self.url_result(
+            'arcpublishing:wapo:' + video_id, 'ArcPublishing', video_id)
 
 
 class WashingtonPostArticleIE(InfoExtractor):
@@ -121,9 +57,8 @@ class WashingtonPostArticleIE(InfoExtractor):
                 'title': 'Breaking Points: The Paper Mine',
                 'duration': 1290,
                 'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.',
-                'uploader': 'The Washington Post',
-                'timestamp': 1395527908,
-                'upload_date': '20140322',
+                'timestamp': 1395440416,
+                'upload_date': '20140321',
             },
         }, {
             'md5': '1fff6a689d8770966df78c8cb6c8c17c',
@@ -133,9 +68,8 @@ class WashingtonPostArticleIE(InfoExtractor):
                 'title': 'The town bureaucracy sustains',
                 'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it\'s like to do paperwork 230 feet underground.',
                 'duration': 2220,
-                'timestamp': 1395528005,
-                'upload_date': '20140322',
-                'uploader': 'The Washington Post',
+                'timestamp': 1395441819,
+                'upload_date': '20140321',
             },
         }],
     }, {
@@ -151,8 +85,7 @@ class WashingtonPostArticleIE(InfoExtractor):
                 'ext': 'mp4',
                 'description': 'Washington Post transportation reporter Ashley Halsey III explains why a plane\'s black box needs to be recovered from a crash site instead of having its information streamed in real time throughout the flight.',
                 'upload_date': '20141230',
-                'uploader': 'The Washington Post',
-                'timestamp': 1419974765,
+                'timestamp': 1419972442,
                 'title': 'Why black boxes don’t transmit data in real time',
             }
         }]
index 44d4a13cac006ac650c6a07b1a604729c25c5205..5cb5924f8041ac13fc97ee8f9f736617f6a627eb 100644 (file)
@@ -17,6 +17,7 @@
     unified_strdate,
     update_url_query,
     urlhandle_detect_ext,
+    url_or_none,
 )
 
 
@@ -42,15 +43,15 @@ def _real_extract(self, url):
         is_live = metadata.get('mediaType') == 'live'
 
         tracker_data = metadata['trackerData']
+        title = tracker_data['trackerClipTitle']
         media_resource = metadata['mediaResource']
 
         formats = []
-        subtitles = {}
 
         # check if the metadata contains a direct URL to a file
-        for kind, media_resource in media_resource.items():
+        for kind, media in media_resource.items():
             if kind == 'captionsHash':
-                for ext, url in media_resource.items():
+                for ext, url in media.items():
                     subtitles.setdefault('de', []).append({
                         'url': url,
                         'ext': ext,
@@ -59,8 +60,10 @@ def _real_extract(self, url):
 
             if kind not in ('dflt', 'alt'):
                 continue
+            if not isinstance(media, dict):
+                continue
 
-            for tag_name, medium_url in media_resource.items():
+            for tag_name, medium_url in media.items():
                 if tag_name not in ('videoURL', 'audioURL'):
                     continue
 
@@ -90,7 +93,23 @@ def _real_extract(self, url):
 
         self._sort_formats(formats)
 
-        title = tracker_data['trackerClipTitle']
+        subtitles = {}
+        caption_url = media_resource.get('captionURL')
+        if caption_url:
+            subtitles['de'] = [{
+                'url': caption_url,
+                'ext': 'ttml',
+            }]
+        captions_hash = media_resource.get('captionsHash')
+        if isinstance(captions_hash, dict):
+            for ext, format_url in captions_hash.items():
+                format_url = url_or_none(format_url)
+                if not format_url:
+                    continue
+                subtitles.setdefault('de', []).append({
+                    'url': format_url,
+                    'ext': determine_ext(format_url, None) or ext,
+                })
 
         return {
             'id': tracker_data.get('trackerClipId', video_id),
@@ -106,7 +125,7 @@ def _real_extract(self, url):
 class WDRPageIE(InfoExtractor):
     _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5'
     _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P<display_id>[^/]+)\.html'
-    _VALID_URL = r'https?://(?:www\d?\.)?(?:wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL
+    _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL
 
     _TESTS = [
         {
@@ -213,7 +232,11 @@ class WDRPageIE(InfoExtractor):
         {
             'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html',
             'only_matching': True,
-        }
+        },
+        {
+            'url': 'https://kinder.wdr.de/tv/die-sendung-mit-dem-elefanten/av/video-folge---astronaut-100.html',
+            'only_matching': True,
+        },
     ]
 
     def _real_extract(self, url):
index 77febd2eb1b1cada3942c212739725135a36682b..ae32a0a68562e382dbef882b9988ebb818f03b51 100644 (file)
@@ -5,79 +5,34 @@
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
-    int_or_none,
     float_or_none,
+    int_or_none,
+    try_get,
     unescapeHTML,
 )
 
 
-class WistiaIE(InfoExtractor):
-    _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/)(?P<id>[a-z0-9]{10})'
+class WistiaBaseIE(InfoExtractor):
+    _VALID_ID_REGEX = r'(?P<id>[a-z0-9]{10})'
+    _VALID_URL_BASE = r'https?://(?:fast\.)?wistia\.(?:net|com)/embed/'
     _EMBED_BASE_URL = 'http://fast.wistia.com/embed/'
 
-    _TESTS = [{
-        'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt',
-        'md5': 'cafeb56ec0c53c18c97405eecb3133df',
-        'info_dict': {
-            'id': 'sh7fpupwlt',
-            'ext': 'mov',
-            'title': 'Being Resourceful',
-            'description': 'a Clients From Hell Video Series video from worldwidewebhosting',
-            'upload_date': '20131204',
-            'timestamp': 1386185018,
-            'duration': 117,
-        },
-    }, {
-        'url': 'wistia:sh7fpupwlt',
-        'only_matching': True,
-    }, {
-        # with hls video
-        'url': 'wistia:807fafadvk',
-        'only_matching': True,
-    }, {
-        'url': 'http://fast.wistia.com/embed/iframe/sh7fpupwlt',
-        'only_matching': True,
-    }, {
-        'url': 'http://fast.wistia.net/embed/medias/sh7fpupwlt.json',
-        'only_matching': True,
-    }]
-
-    # https://wistia.com/support/embed-and-share/video-on-your-website
-    @staticmethod
-    def _extract_url(webpage):
-        urls = WistiaIE._extract_urls(webpage)
-        return urls[0] if urls else None
-
-    @staticmethod
-    def _extract_urls(webpage):
-        urls = []
-        for match in re.finditer(
-                r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage):
-            urls.append(unescapeHTML(match.group('url')))
-        for match in re.finditer(
-                r'''(?sx)
-                    <div[^>]+class=(["'])(?:(?!\1).)*?\bwistia_async_(?P<id>[a-z0-9]{10})\b(?:(?!\1).)*?\1
-                ''', webpage):
-            urls.append('wistia:%s' % match.group('id'))
-        for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})', webpage):
-            urls.append('wistia:%s' % match.group('id'))
-        return urls
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        data_json = self._download_json(
-            self._EMBED_BASE_URL + 'medias/%s.json' % video_id, video_id,
-            # Some videos require this.
-            headers={
-                'Referer': url if url.startswith('http') else self._EMBED_BASE_URL + 'iframe/' + video_id,
+    def _download_embed_config(self, config_type, config_id, referer):
+        base_url = self._EMBED_BASE_URL + '%ss/%s' % (config_type, config_id)
+        embed_config = self._download_json(
+            base_url + '.json', config_id, headers={
+                'Referer': referer if referer.startswith('http') else base_url,  # Some videos require this.
             })
 
-        if data_json.get('error'):
+        if isinstance(embed_config, dict) and embed_config.get('error'):
             raise ExtractorError(
                 'Error while getting the playlist', expected=True)
 
-        data = data_json['media']
+        return embed_config
+
+    def _extract_media(self, embed_config):
+        data = embed_config['media']
+        video_id = data['hashedId']
         title = data['name']
 
         formats = []
@@ -160,3 +115,85 @@ def _real_extract(self, url):
             'timestamp': int_or_none(data.get('createdAt')),
             'subtitles': subtitles,
         }
+
+
+class WistiaIE(WistiaBaseIE):
+    _VALID_URL = r'(?:wistia:|%s(?:iframe|medias)/)%s' % (WistiaBaseIE._VALID_URL_BASE, WistiaBaseIE._VALID_ID_REGEX)
+
+    _TESTS = [{
+        # with hls video
+        'url': 'wistia:807fafadvk',
+        'md5': 'daff0f3687a41d9a71b40e0e8c2610fe',
+        'info_dict': {
+            'id': '807fafadvk',
+            'ext': 'mp4',
+            'title': 'Drip Brennan Dunn Workshop',
+            'description': 'a JV Webinars video',
+            'upload_date': '20160518',
+            'timestamp': 1463607249,
+            'duration': 4987.11,
+        },
+    }, {
+        'url': 'wistia:sh7fpupwlt',
+        'only_matching': True,
+    }, {
+        'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt',
+        'only_matching': True,
+    }, {
+        'url': 'http://fast.wistia.com/embed/iframe/sh7fpupwlt',
+        'only_matching': True,
+    }, {
+        'url': 'http://fast.wistia.net/embed/medias/sh7fpupwlt.json',
+        'only_matching': True,
+    }]
+
+    # https://wistia.com/support/embed-and-share/video-on-your-website
+    @staticmethod
+    def _extract_url(webpage):
+        urls = WistiaIE._extract_urls(webpage)
+        return urls[0] if urls else None
+
+    @staticmethod
+    def _extract_urls(webpage):
+        urls = []
+        for match in re.finditer(
+                r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage):
+            urls.append(unescapeHTML(match.group('url')))
+        for match in re.finditer(
+                r'''(?sx)
+                    <div[^>]+class=(["'])(?:(?!\1).)*?\bwistia_async_(?P<id>[a-z0-9]{10})\b(?:(?!\1).)*?\1
+                ''', webpage):
+            urls.append('wistia:%s' % match.group('id'))
+        for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})', webpage):
+            urls.append('wistia:%s' % match.group('id'))
+        return urls
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        embed_config = self._download_embed_config('media', video_id, url)
+        return self._extract_media(embed_config)
+
+
+class WistiaPlaylistIE(WistiaBaseIE):
+    _VALID_URL = r'%splaylists/%s' % (WistiaIE._VALID_URL_BASE, WistiaIE._VALID_ID_REGEX)
+
+    _TEST = {
+        'url': 'https://fast.wistia.net/embed/playlists/aodt9etokc',
+        'info_dict': {
+            'id': 'aodt9etokc',
+        },
+        'playlist_count': 3,
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        playlist = self._download_embed_config('playlist', playlist_id, url)
+
+        entries = []
+        for media in (try_get(playlist, lambda x: x[0]['medias']) or []):
+            embed_config = media.get('embed_config')
+            if not embed_config:
+                continue
+            entries.append(self._extract_media(embed_config))
+
+        return self.playlist_result(entries, playlist_id)
index e8f6ae10f97470932f31afd6f5c92e9c802c28aa..6fcd8ee7e9b779e0e341ad6285226cabc2cbdca8 100644 (file)
@@ -1,23 +1,43 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import json
+import re
+
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     determine_ext,
     float_or_none,
     int_or_none,
+    mimetype2ext,
     try_get,
-    urlencode_postdata,
+    urljoin,
 )
 
 
 class YandexDiskIE(InfoExtractor):
-    _VALID_URL = r'https?://yadi\.sk/[di]/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'''(?x)https?://
+        (?P<domain>
+            yadi\.sk|
+            disk\.yandex\.
+                (?:
+                    az|
+                    by|
+                    co(?:m(?:\.(?:am|ge|tr))?|\.il)|
+                    ee|
+                    fr|
+                    k[gz]|
+                    l[tv]|
+                    md|
+                    t[jm]|
+                    u[az]|
+                    ru
+                )
+        )/(?:[di]/|public.*?\bhash=)(?P<id>[^/?#&]+)'''
 
     _TESTS = [{
         'url': 'https://yadi.sk/i/VdOeDou8eZs6Y',
-        'md5': '33955d7ae052f15853dc41f35f17581c',
+        'md5': 'a4a8d52958c8fddcf9845935070402ae',
         'info_dict': {
             'id': 'VdOeDou8eZs6Y',
             'ext': 'mp4',
@@ -27,92 +47,101 @@ class YandexDiskIE(InfoExtractor):
             'uploader_id': '300043621',
             'view_count': int,
         },
+        'expected_warnings': ['Unable to download JSON metadata'],
     }, {
         'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce',
         'only_matching': True,
+    }, {
+        'url': 'https://yadi.sk/public?hash=5DZ296JK9GWCLp02f6jrObjnctjRxMs8L6%2B%2FuhNqk38%3D',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        status = self._download_webpage(
-            'https://disk.yandex.com/auth/status', video_id, query={
-                'urlOrigin': url,
-                'source': 'public',
-                'md5': 'false',
-            })
-
-        sk = self._search_regex(
-            r'(["\'])sk(?:External)?\1\s*:\s*(["\'])(?P<value>(?:(?!\2).)+)\2',
-            status, 'sk', group='value')
+        domain, video_id = re.match(self._VALID_URL, url).groups()
 
         webpage = self._download_webpage(url, video_id)
-
-        models = self._parse_json(
-            self._search_regex(
-                r'<script[^>]+id=["\']models-client[^>]+>\s*(\[.+?\])\s*</script',
-                webpage, 'video JSON'),
-            video_id)
-
-        data = next(
-            model['data'] for model in models
-            if model.get('model') == 'resource')
-
-        video_hash = data['id']
-        title = data['name']
-
-        models = self._download_json(
-            'https://disk.yandex.com/models/', video_id,
-            data=urlencode_postdata({
-                '_model.0': 'videoInfo',
-                'id.0': video_hash,
-                '_model.1': 'do-get-resource-url',
-                'id.1': video_hash,
-                'version': '13.6',
-                'sk': sk,
-            }), query={'_m': 'videoInfo'})['models']
-
-        videos = try_get(models, lambda x: x[0]['data']['videos'], list) or []
-        source_url = try_get(
-            models, lambda x: x[1]['data']['file'], compat_str)
+        store = self._parse_json(self._search_regex(
+            r'<script[^>]+id="store-prefetch"[^>]*>\s*({.+?})\s*</script>',
+            webpage, 'store'), video_id)
+        resource = store['resources'][store['rootResourceId']]
+
+        title = resource['name']
+        meta = resource.get('meta') or {}
+
+        public_url = meta.get('short_url')
+        if public_url:
+            video_id = self._match_id(public_url)
+
+        source_url = (self._download_json(
+            'https://cloud-api.yandex.net/v1/disk/public/resources/download',
+            video_id, query={'public_key': url}, fatal=False) or {}).get('href')
+        video_streams = resource.get('videoStreams') or {}
+        video_hash = resource.get('hash') or url
+        environment = store.get('environment') or {}
+        sk = environment.get('sk')
+        yandexuid = environment.get('yandexuid')
+        if sk and yandexuid and not (source_url and video_streams):
+            self._set_cookie(domain, 'yandexuid', yandexuid)
+
+            def call_api(action):
+                return (self._download_json(
+                    urljoin(url, '/public/api/') + action, video_id, data=json.dumps({
+                        'hash': video_hash,
+                        'sk': sk,
+                    }).encode(), headers={
+                        'Content-Type': 'text/plain',
+                    }, fatal=False) or {}).get('data') or {}
+            if not source_url:
+                # TODO: figure out how to detect if download limit has
+                # been reached and then avoid unnecessary source format
+                # extraction requests
+                source_url = call_api('download-url').get('url')
+            if not video_streams:
+                video_streams = call_api('get-video-streams')
 
         formats = []
         if source_url:
             formats.append({
                 'url': source_url,
                 'format_id': 'source',
-                'ext': determine_ext(title, 'mp4'),
+                'ext': determine_ext(title, meta.get('ext') or mimetype2ext(meta.get('mime_type')) or 'mp4'),
                 'quality': 1,
+                'filesize': int_or_none(meta.get('size'))
             })
-        for video in videos:
+
+        for video in (video_streams.get('videos') or []):
             format_url = video.get('url')
             if not format_url:
                 continue
-            if determine_ext(format_url) == 'm3u8':
+            if video.get('dimension') == 'adaptive':
                 formats.extend(self._extract_m3u8_formats(
-                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    format_url, video_id, 'mp4', 'm3u8_native',
                     m3u8_id='hls', fatal=False))
             else:
+                size = video.get('size') or {}
+                height = int_or_none(size.get('height'))
+                format_id = 'hls'
+                if height:
+                    format_id += '-%dp' % height
                 formats.append({
+                    'ext': 'mp4',
+                    'format_id': format_id,
+                    'height': height,
+                    'protocol': 'm3u8_native',
                     'url': format_url,
+                    'width': int_or_none(size.get('width')),
                 })
         self._sort_formats(formats)
 
-        duration = float_or_none(try_get(
-            models, lambda x: x[0]['data']['duration']), 1000)
-        uploader = try_get(
-            data, lambda x: x['user']['display_name'], compat_str)
-        uploader_id = try_get(
-            data, lambda x: x['user']['uid'], compat_str)
-        view_count = int_or_none(try_get(
-            data, lambda x: x['meta']['views_counter']))
+        uid = resource.get('uid')
+        display_name = try_get(store, lambda x: x['users'][uid]['displayName'])
 
         return {
             'id': video_id,
             'title': title,
-            'duration': duration,
-            'uploader': uploader,
-            'uploader_id': uploader_id,
-            'view_count': view_count,
+            'duration': float_or_none(video_streams.get('duration'), 1000),
+            'uploader': display_name,
+            'uploader_id': uid,
+            'view_count': int_or_none(meta.get('views_counter')),
             'formats': formats,
         }
index 4358bc83669ffab11e5dab39444ddce1acdab2cd..3cc13bc5ba889c2d56e9d182c4794dd3e5502fea 100644 (file)
@@ -15,6 +15,8 @@
 
 
 class YandexMusicBaseIE(InfoExtractor):
+    _VALID_URL_BASE = r'https?://music\.yandex\.(?P<tld>ru|kz|ua|by|com)'
+
     @staticmethod
     def _handle_error(response):
         if isinstance(response, dict):
@@ -46,57 +48,72 @@ def _download_json(self, *args, **kwargs):
         self._handle_error(response)
         return response
 
+    def _call_api(self, ep, tld, url, item_id, note, query):
+        return self._download_json(
+            'https://music.yandex.%s/handlers/%s.jsx' % (tld, ep),
+            item_id, note,
+            fatal=False,
+            headers={
+                'Referer': url,
+                'X-Requested-With': 'XMLHttpRequest',
+                'X-Retpath-Y': url,
+            },
+            query=query)
+
 
 class YandexMusicTrackIE(YandexMusicBaseIE):
     IE_NAME = 'yandexmusic:track'
     IE_DESC = 'Яндекс.Музыка - Трек'
-    _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<album_id>\d+)/track/(?P<id>\d+)'
+    _VALID_URL = r'%s/album/(?P<album_id>\d+)/track/(?P<id>\d+)' % YandexMusicBaseIE._VALID_URL_BASE
 
     _TESTS = [{
         'url': 'http://music.yandex.ru/album/540508/track/4878838',
-        'md5': 'f496818aa2f60b6c0062980d2e00dc20',
+        'md5': 'dec8b661f12027ceaba33318787fff76',
         'info_dict': {
             'id': '4878838',
             'ext': 'mp3',
-            'title': 'Carlo Ambrosio & Fabio Di Bari - Gypsy Eyes 1',
-            'filesize': 4628061,
+            'title': 'md5:c63e19341fdbe84e43425a30bc777856',
+            'filesize': int,
             'duration': 193.04,
-            'track': 'Gypsy Eyes 1',
-            'album': 'Gypsy Soul',
-            'album_artist': 'Carlo Ambrosio',
-            'artist': 'Carlo Ambrosio & Fabio Di Bari',
+            'track': 'md5:210508c6ffdfd67a493a6c378f22c3ff',
+            'album': 'md5:cd04fb13c4efeafdfa0a6a6aca36d01a',
+            'album_artist': 'md5:5f54c35462c07952df33d97cfb5fc200',
+            'artist': 'md5:e6fd86621825f14dc0b25db3acd68160',
             'release_year': 2009,
         },
-        'skip': 'Travis CI servers blocked by YandexMusic',
+        'skip': 'Travis CI servers blocked by YandexMusic',
     }, {
         # multiple disks
         'url': 'http://music.yandex.ru/album/3840501/track/705105',
-        'md5': 'ebe7b4e2ac7ac03fe11c19727ca6153e',
+        'md5': '82a54e9e787301dd45aba093cf6e58c0',
         'info_dict': {
             'id': '705105',
             'ext': 'mp3',
-            'title': 'Hooverphonic - Sometimes',
-            'filesize': 5743386,
+            'title': 'md5:f86d4a9188279860a83000277024c1a6',
+            'filesize': int,
             'duration': 239.27,
-            'track': 'Sometimes',
-            'album': 'The Best of Hooverphonic',
-            'album_artist': 'Hooverphonic',
-            'artist': 'Hooverphonic',
+            'track': 'md5:40f887f0666ba1aa10b835aca44807d1',
+            'album': 'md5:624f5224b14f5c88a8e812fd7fbf1873',
+            'album_artist': 'md5:dd35f2af4e8927100cbe6f5e62e1fb12',
+            'artist': 'md5:dd35f2af4e8927100cbe6f5e62e1fb12',
             'release_year': 2016,
             'genre': 'pop',
             'disc_number': 2,
             'track_number': 9,
         },
-        'skip': 'Travis CI servers blocked by YandexMusic',
+        # 'skip': 'Travis CI servers blocked by YandexMusic',
+    }, {
+        'url': 'http://music.yandex.com/album/540508/track/4878838',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        album_id, track_id = mobj.group('album_id'), mobj.group('id')
+        tld, album_id, track_id = mobj.group('tld'), mobj.group('album_id'), mobj.group('id')
 
-        track = self._download_json(
-            'http://music.yandex.ru/handlers/track.jsx?track=%s:%s' % (track_id, album_id),
-            track_id, 'Downloading track JSON')['track']
+        track = self._call_api(
+            'track', tld, url, track_id, 'Downloading track JSON',
+            {'track': '%s:%s' % (track_id, album_id)})['track']
         track_title = track['title']
 
         download_data = self._download_json(
@@ -109,8 +126,7 @@ def _real_extract(self, url):
             'Downloading track location JSON',
             query={'format': 'json'})
         key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + fd_data['path'][1:] + fd_data['s']).encode('utf-8')).hexdigest()
-        storage = track['storageDir'].split('.')
-        f_url = 'http://%s/get-mp3/%s/%s?track-id=%s ' % (fd_data['host'], key, fd_data['ts'] + fd_data['path'], storage[1])
+        f_url = 'http://%s/get-mp3/%s/%s?track-id=%s ' % (fd_data['host'], key, fd_data['ts'] + fd_data['path'], track['id'])
 
         thumbnail = None
         cover_uri = track.get('albums', [{}])[0].get('coverUri')
@@ -180,46 +196,104 @@ def extract_artist(artist_list):
 
 
 class YandexMusicPlaylistBaseIE(YandexMusicBaseIE):
+    def _extract_tracks(self, source, item_id, url, tld):
+        tracks = source['tracks']
+        track_ids = [compat_str(track_id) for track_id in source['trackIds']]
+
+        # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks,
+        # missing tracks should be retrieved manually.
+        if len(tracks) < len(track_ids):
+            present_track_ids = set([
+                compat_str(track['id'])
+                for track in tracks if track.get('id')])
+            missing_track_ids = [
+                track_id for track_id in track_ids
+                if track_id not in present_track_ids]
+            missing_tracks = self._call_api(
+                'track-entries', tld, url, item_id,
+                'Downloading missing tracks JSON', {
+                    'entries': ','.join(missing_track_ids),
+                    'lang': tld,
+                    'external-domain': 'music.yandex.%s' % tld,
+                    'overembed': 'false',
+                    'strict': 'true',
+                })
+            if missing_tracks:
+                tracks.extend(missing_tracks)
+
+        return tracks
+
     def _build_playlist(self, tracks):
-        return [
-            self.url_result(
-                'http://music.yandex.ru/album/%s/track/%s' % (track['albums'][0]['id'], track['id']))
-            for track in tracks if track.get('albums') and isinstance(track.get('albums'), list)]
+        entries = []
+        for track in tracks:
+            track_id = track.get('id') or track.get('realId')
+            if not track_id:
+                continue
+            albums = track.get('albums')
+            if not albums or not isinstance(albums, list):
+                continue
+            album = albums[0]
+            if not isinstance(album, dict):
+                continue
+            album_id = album.get('id')
+            if not album_id:
+                continue
+            entries.append(self.url_result(
+                'http://music.yandex.ru/album/%s/track/%s' % (album_id, track_id),
+                ie=YandexMusicTrackIE.ie_key(), video_id=track_id))
+        return entries
 
 
 class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE):
     IE_NAME = 'yandexmusic:album'
     IE_DESC = 'Яндекс.Музыка - Альбом'
-    _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<id>\d+)/?(\?|$)'
+    _VALID_URL = r'%s/album/(?P<id>\d+)' % YandexMusicBaseIE._VALID_URL_BASE
 
     _TESTS = [{
         'url': 'http://music.yandex.ru/album/540508',
         'info_dict': {
             'id': '540508',
-            'title': 'Carlo Ambrosio - Gypsy Soul (2009)',
+            'title': 'md5:7ed1c3567f28d14be9f61179116f5571',
         },
         'playlist_count': 50,
-        'skip': 'Travis CI servers blocked by YandexMusic',
+        'skip': 'Travis CI servers blocked by YandexMusic',
     }, {
         'url': 'https://music.yandex.ru/album/3840501',
         'info_dict': {
             'id': '3840501',
-            'title': 'Hooverphonic - The Best of Hooverphonic (2016)',
+            'title': 'md5:36733472cdaa7dcb1fd9473f7da8e50f',
         },
         'playlist_count': 33,
-        'skip': 'Travis CI servers blocked by YandexMusic',
+        # 'skip': 'Travis CI servers blocked by YandexMusic',
+    }, {
+        # empty artists
+        'url': 'https://music.yandex.ru/album/9091882',
+        'info_dict': {
+            'id': '9091882',
+            'title': 'ТЕД на русском',
+        },
+        'playlist_count': 187,
     }]
 
+    @classmethod
+    def suitable(cls, url):
+        return False if YandexMusicTrackIE.suitable(url) else super(YandexMusicAlbumIE, cls).suitable(url)
+
     def _real_extract(self, url):
-        album_id = self._match_id(url)
+        mobj = re.match(self._VALID_URL, url)
+        tld = mobj.group('tld')
+        album_id = mobj.group('id')
 
-        album = self._download_json(
-            'http://music.yandex.ru/handlers/album.jsx?album=%s' % album_id,
-            album_id, 'Downloading album JSON')
+        album = self._call_api(
+            'album', tld, url, album_id, 'Downloading album JSON',
+            {'album': album_id})
 
         entries = self._build_playlist([track for volume in album['volumes'] for track in volume])
 
-        title = '%s - %s' % (album['artists'][0]['name'], album['title'])
+        title = album['title']
+        artist = try_get(album, lambda x: x['artists'][0]['name'], compat_str)
+        if artist:
+            title = '%s - %s' % (artist, title)
         year = album.get('year')
         if year:
             title += ' (%s)' % year
@@ -230,27 +304,30 @@ def _real_extract(self, url):
 class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
     IE_NAME = 'yandexmusic:playlist'
     IE_DESC = 'Яндекс.Музыка - Плейлист'
-    _VALID_URL = r'https?://music\.yandex\.(?P<tld>ru|kz|ua|by)/users/(?P<user>[^/]+)/playlists/(?P<id>\d+)'
+    _VALID_URL = r'%s/users/(?P<user>[^/]+)/playlists/(?P<id>\d+)' % YandexMusicBaseIE._VALID_URL_BASE
 
     _TESTS = [{
         'url': 'http://music.yandex.ru/users/music.partners/playlists/1245',
         'info_dict': {
             'id': '1245',
-            'title': 'Что слушают Enter Shikari',
+            'title': 'md5:841559b3fe2b998eca88d0d2e22a3097',
             'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9',
         },
-        'playlist_count': 6,
-        'skip': 'Travis CI servers blocked by YandexMusic',
+        'playlist_count': 5,
+        'skip': 'Travis CI servers blocked by YandexMusic',
     }, {
-        # playlist exceeding the limit of 150 tracks shipped with webpage (see
-        # https://github.com/ytdl-org/youtube-dl/issues/6666)
         'url': 'https://music.yandex.ru/users/ya.playlist/playlists/1036',
+        'only_matching': True,
+    }, {
+        # playlist exceeding the limit of 150 tracks (see
+        # https://github.com/ytdl-org/youtube-dl/issues/6666)
+        'url': 'https://music.yandex.ru/users/mesiaz/playlists/1364',
         'info_dict': {
-            'id': '1036',
-            'title': 'Музыка 90-х',
+            'id': '1364',
+            'title': 'md5:b3b400f997d3f878a13ae0699653f7db',
         },
-        'playlist_mincount': 300,
-        'skip': 'Travis CI servers blocked by YandexMusic',
+        'playlist_mincount': 437,
+        'skip': 'Travis CI servers blocked by YandexMusic',
     }]
 
     def _real_extract(self, url):
@@ -259,16 +336,8 @@ def _real_extract(self, url):
         user = mobj.group('user')
         playlist_id = mobj.group('id')
 
-        playlist = self._download_json(
-            'https://music.yandex.%s/handlers/playlist.jsx' % tld,
-            playlist_id, 'Downloading missing tracks JSON',
-            fatal=False,
-            headers={
-                'Referer': url,
-                'X-Requested-With': 'XMLHttpRequest',
-                'X-Retpath-Y': url,
-            },
-            query={
+        playlist = self._call_api(
+            'playlist', tld, url, playlist_id, 'Downloading playlist JSON', {
                 'owner': user,
                 'kinds': playlist_id,
                 'light': 'true',
@@ -277,37 +346,103 @@ def _real_extract(self, url):
                 'overembed': 'false',
             })['playlist']
 
-        tracks = playlist['tracks']
-        track_ids = [compat_str(track_id) for track_id in playlist['trackIds']]
-
-        # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks,
-        # missing tracks should be retrieved manually.
-        if len(tracks) < len(track_ids):
-            present_track_ids = set([
-                compat_str(track['id'])
-                for track in tracks if track.get('id')])
-            missing_track_ids = [
-                track_id for track_id in track_ids
-                if track_id not in present_track_ids]
-            missing_tracks = self._download_json(
-                'https://music.yandex.%s/handlers/track-entries.jsx' % tld,
-                playlist_id, 'Downloading missing tracks JSON',
-                fatal=False,
-                headers={
-                    'Referer': url,
-                    'X-Requested-With': 'XMLHttpRequest',
-                },
-                query={
-                    'entries': ','.join(missing_track_ids),
-                    'lang': tld,
-                    'external-domain': 'music.yandex.%s' % tld,
-                    'overembed': 'false',
-                    'strict': 'true',
-                })
-            if missing_tracks:
-                tracks.extend(missing_tracks)
+        tracks = self._extract_tracks(playlist, playlist_id, url, tld)
 
         return self.playlist_result(
             self._build_playlist(tracks),
             compat_str(playlist_id),
             playlist.get('title'), playlist.get('description'))
+
+
+class YandexMusicArtistBaseIE(YandexMusicPlaylistBaseIE):
+    def _call_artist(self, tld, url, artist_id):
+        return self._call_api(
+            'artist', tld, url, artist_id,
+            'Downloading artist %s JSON' % self._ARTIST_WHAT, {
+                'artist': artist_id,
+                'what': self._ARTIST_WHAT,
+                'sort': self._ARTIST_SORT or '',
+                'dir': '',
+                'period': '',
+                'lang': tld,
+                'external-domain': 'music.yandex.%s' % tld,
+                'overembed': 'false',
+            })
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        tld = mobj.group('tld')
+        artist_id = mobj.group('id')
+        data = self._call_artist(tld, url, artist_id)
+        tracks = self._extract_tracks(data, artist_id, url, tld)
+        title = try_get(data, lambda x: x['artist']['name'], compat_str)
+        return self.playlist_result(
+            self._build_playlist(tracks), artist_id, title)
+
+
+class YandexMusicArtistTracksIE(YandexMusicArtistBaseIE):
+    IE_NAME = 'yandexmusic:artist:tracks'
+    IE_DESC = 'Яндекс.Музыка - Артист - Треки'
+    _VALID_URL = r'%s/artist/(?P<id>\d+)/tracks' % YandexMusicBaseIE._VALID_URL_BASE
+
+    _TESTS = [{
+        'url': 'https://music.yandex.ru/artist/617526/tracks',
+        'info_dict': {
+            'id': '617526',
+            'title': 'md5:131aef29d45fd5a965ca613e708c040b',
+        },
+        'playlist_count': 507,
+        # 'skip': 'Travis CI servers blocked by YandexMusic',
+    }]
+
+    _ARTIST_SORT = ''
+    _ARTIST_WHAT = 'tracks'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        tld = mobj.group('tld')
+        artist_id = mobj.group('id')
+        data = self._call_artist(tld, url, artist_id)
+        tracks = self._extract_tracks(data, artist_id, url, tld)
+        artist = try_get(data, lambda x: x['artist']['name'], compat_str)
+        title = '%s - %s' % (artist or artist_id, 'Треки')
+        return self.playlist_result(
+            self._build_playlist(tracks), artist_id, title)
+
+
+class YandexMusicArtistAlbumsIE(YandexMusicArtistBaseIE):
+    IE_NAME = 'yandexmusic:artist:albums'
+    IE_DESC = 'Яндекс.Музыка - Артист - Альбомы'
+    _VALID_URL = r'%s/artist/(?P<id>\d+)/albums' % YandexMusicBaseIE._VALID_URL_BASE
+
+    _TESTS = [{
+        'url': 'https://music.yandex.ru/artist/617526/albums',
+        'info_dict': {
+            'id': '617526',
+            'title': 'md5:55dc58d5c85699b7fb41ee926700236c',
+        },
+        'playlist_count': 8,
+        # 'skip': 'Travis CI servers blocked by YandexMusic',
+    }]
+
+    _ARTIST_SORT = 'year'
+    _ARTIST_WHAT = 'albums'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        tld = mobj.group('tld')
+        artist_id = mobj.group('id')
+        data = self._call_artist(tld, url, artist_id)
+        entries = []
+        for album in data['albums']:
+            if not isinstance(album, dict):
+                continue
+            album_id = album.get('id')
+            if not album_id:
+                continue
+            entries.append(self.url_result(
+                'http://music.yandex.ru/album/%s' % album_id,
+                ie=YandexMusicAlbumIE.ie_key(), video_id=album_id))
+        artist = try_get(data, lambda x: x['artist']['name'], compat_str)
+        title = '%s - %s' % (artist or artist_id, 'Альбомы')
+        return self.playlist_result(entries, artist_id, title)
index 46529be05b65ee896af1e50c75d34a5528b74596..6a166ec9b92897578f52e9c5ae20bf078b9e8f05 100644 (file)
@@ -5,6 +5,7 @@
 from ..utils import (
     determine_ext,
     int_or_none,
+    try_get,
     url_or_none,
 )
 
@@ -13,26 +14,30 @@ class YandexVideoIE(InfoExtractor):
     _VALID_URL = r'''(?x)
                     https?://
                         (?:
-                            yandex\.ru(?:/portal/(?:video|efir))?/?\?.*?stream_id=|
+                            yandex\.ru(?:/(?:portal/(?:video|efir)|efir))?/?\?.*?stream_id=|
                             frontend\.vh\.yandex\.ru/player/
                         )
-                        (?P<id>[\da-f]+)
+                        (?P<id>(?:[\da-f]{32}|[\w-]{12}))
                     '''
     _TESTS = [{
-        'url': 'https://yandex.ru/portal/video?stream_id=4dbb262b4fe5cf15a215de4f34eee34d',
-        'md5': '33955d7ae052f15853dc41f35f17581c',
+        'url': 'https://yandex.ru/portal/video?stream_id=4dbb36ec4e0526d58f9f2dc8f0ecf374',
+        'md5': 'e02a05bfaf0d9615ef07ae3a10f4faf4',
         'info_dict': {
-            'id': '4dbb262b4fe5cf15a215de4f34eee34d',
+            'id': '4dbb36ec4e0526d58f9f2dc8f0ecf374',
             'ext': 'mp4',
-            'title': 'Ð\92 Ð\9dÑ\8cÑ\8e\99оÑ\80ке Ð±Ð°Ñ\80жи Ð¸ Ñ\82еплоÑ\85од Ð¾Ñ\82оÑ\80валиÑ\81Ñ\8c Ð¾Ñ\82 Ð¿Ñ\80иÑ\87ала Ð¸ Ñ\80аÑ\81плÑ\8bлиÑ\81Ñ\8c Ð¿Ð¾ Ð\93Ñ\83дзонÑ\83',
-            'description': '',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'timestamp': 0,
-            'duration': 30,
+            'title': 'РÑ\83Ñ\81Ñ\81кий Ð\92Ñ\83дÑ\81Ñ\82ок - Ð³Ð»Ð°Ð²Ð½Ñ\8bй Ñ\80ок-Ñ\84еÑ\81Ñ\82 Ð² Ð¸Ñ\81Ñ\82оÑ\80ии Ð¡Ð¡Ð¡Ð  / Ð²Ð\94Ñ\83дÑ\8c',
+            'description': 'md5:7d6b8d4bc4a3b9a56499916c1ea5b5fa',
+            'thumbnail': r're:^https?://',
+            'timestamp': 1549972939,
+            'duration': 5575,
             'age_limit': 18,
+            'upload_date': '20190212',
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
         },
     }, {
-        'url': 'https://yandex.ru/portal/efir?stream_id=4dbb36ec4e0526d58f9f2dc8f0ecf374&from=morda',
+        'url': 'https://yandex.ru/portal/efir?stream_id=4dbb262b4fe5cf15a215de4f34eee34d&from=morda',
         'only_matching': True,
     }, {
         'url': 'https://yandex.ru/?stream_id=4dbb262b4fe5cf15a215de4f34eee34d',
@@ -52,53 +57,88 @@ class YandexVideoIE(InfoExtractor):
         # DASH with DRM
         'url': 'https://yandex.ru/portal/video?from=morda&stream_id=485a92d94518d73a9d0ff778e13505f8',
         'only_matching': True,
+    }, {
+        'url': 'https://yandex.ru/efir?stream_active=watching&stream_id=v7a2dZ-v5mSI&from_block=efir_newtab',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        content = self._download_json(
-            'https://frontend.vh.yandex.ru/v22/player/%s.json' % video_id,
-            video_id, query={
-                'stream_options': 'hires',
-                'disable_trackings': 1,
-            })['content']
-
-        content_url = url_or_none(content.get('content_url')) or url_or_none(
-            content['streams'][0]['url'])
-        title = content.get('title') or content.get('computed_title')
+        player = try_get((self._download_json(
+            'https://frontend.vh.yandex.ru/graphql', video_id, data=('''{
+  player(content_id: "%s") {
+    computed_title
+    content_url
+    description
+    dislikes
+    duration
+    likes
+    program_title
+    release_date
+    release_date_ut
+    release_year
+    restriction_age
+    season
+    start_time
+    streams
+    thumbnail
+    title
+    views_count
+  }
+}''' % video_id).encode(), fatal=False)), lambda x: x['player']['content'])
+        if not player or player.get('error'):
+            player = self._download_json(
+                'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id,
+                video_id, query={
+                    'stream_options': 'hires',
+                    'disable_trackings': 1,
+                })
+        content = player['content']
 
-        ext = determine_ext(content_url)
+        title = content.get('title') or content['computed_title']
 
-        if ext == 'm3u8':
-            formats = self._extract_m3u8_formats(
-                content_url, video_id, 'mp4', entry_protocol='m3u8_native',
-                m3u8_id='hls')
-        elif ext == 'mpd':
-            formats = self._extract_mpd_formats(
-                content_url, video_id, mpd_id='dash')
-        else:
-            formats = [{'url': content_url}]
+        formats = []
+        streams = content.get('streams') or []
+        streams.append({'url': content.get('content_url')})
+        for stream in streams:
+            content_url = url_or_none(stream.get('url'))
+            if not content_url:
+                continue
+            ext = determine_ext(content_url)
+            if ext == 'ismc':
+                continue
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    content_url, video_id, 'mp4',
+                    'm3u8_native', m3u8_id='hls', fatal=False))
+            elif ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    content_url, video_id, mpd_id='dash', fatal=False))
+            else:
+                formats.append({'url': content_url})
 
         self._sort_formats(formats)
 
-        description = content.get('description')
-        thumbnail = content.get('thumbnail')
         timestamp = (int_or_none(content.get('release_date'))
                      or int_or_none(content.get('release_date_ut'))
                      or int_or_none(content.get('start_time')))
-        duration = int_or_none(content.get('duration'))
-        series = content.get('program_title')
-        age_limit = int_or_none(content.get('restriction_age'))
+        season = content.get('season') or {}
 
         return {
             'id': video_id,
             'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
+            'description': content.get('description'),
+            'thumbnail': content.get('thumbnail'),
             'timestamp': timestamp,
-            'duration': duration,
-            'series': series,
-            'age_limit': age_limit,
+            'duration': int_or_none(content.get('duration')),
+            'series': content.get('program_title'),
+            'age_limit': int_or_none(content.get('restriction_age')),
+            'view_count': int_or_none(content.get('views_count')),
+            'like_count': int_or_none(content.get('likes')),
+            'dislike_count': int_or_none(content.get('dislikes')),
+            'season_number': int_or_none(season.get('season_number')),
+            'season_id': season.get('id'),
+            'release_year': int_or_none(content.get('release_year')),
             'formats': formats,
         }
index e0f211b741f712b1e1d9b394a54233385e67dc50..c67ecde04202567e6fa6675bc668519d67a9b00e 100644 (file)
@@ -16,6 +16,7 @@
 from ..swfinterp import SWFInterpreter
 from ..compat import (
     compat_chr,
+    compat_HTTPError,
     compat_kwargs,
     compat_parse_qs,
     compat_urllib_parse_unquote,
@@ -64,9 +65,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
     _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
 
     _RESERVED_NAMES = (
-        r'course|embed|channel|c|user|playlist|watch|w|results|storefront|oops|'
-        r'shared|index|account|reporthistory|t/terms|about|upload|signin|logout|'
-        r'feed/(watch_later|history|subscriptions|library|trending|recommended)')
+        r'embed|e|watch_popup|channel|c|user|playlist|watch|w|v|movies|results|shared|'
+        r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout|'
+        r'feed/(?:watch_later|history|subscriptions|library|trending|recommended)')
 
     _NETRC_MACHINE = 'youtube'
     # If True it will raise an error if no login info is provided
@@ -74,11 +75,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
 
     _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'
 
-    _YOUTUBE_CLIENT_HEADERS = {
-        'x-youtube-client-name': '1',
-        'x-youtube-client-version': '1.20200609.04.02',
-    }
-
     def _set_language(self):
         self._set_cookie(
             '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
@@ -307,6 +303,8 @@ def _real_initialize(self):
     }
 
     _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
+    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
+    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
 
     def _call_api(self, ep, query, video_id):
         data = self._DEFAULT_API_DATA.copy()
@@ -324,10 +322,16 @@ def _call_api(self, ep, query, video_id):
     def _extract_yt_initial_data(self, video_id, webpage):
         return self._parse_json(
             self._search_regex(
-                (r'%s\s*\n' % self._YT_INITIAL_DATA_RE,
+                (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
                  self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
             video_id)
 
+    def _extract_ytcfg(self, video_id, webpage):
+        return self._parse_json(
+            self._search_regex(
+                r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
+                default='{}'), video_id, fatal=False)
+
 
 class YoutubeIE(YoutubeBaseInfoExtractor):
     IE_DESC = 'YouTube.com'
@@ -343,14 +347,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                             # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
                             (?:(?:www|dev)\.)?invidio\.us/|
                             (?:(?:www|no)\.)?invidiou\.sh/|
-                            (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
+                            (?:(?:www|fi)\.)?invidious\.snopyta\.org/|
                             (?:www\.)?invidious\.kabi\.tk/|
                             (?:www\.)?invidious\.13ad\.de/|
                             (?:www\.)?invidious\.mastodon\.host/|
+                            (?:www\.)?invidious\.zapashcanon\.fr/|
+                            (?:www\.)?invidious\.kavin\.rocks/|
+                            (?:www\.)?invidious\.tube/|
+                            (?:www\.)?invidiou\.site/|
+                            (?:www\.)?invidious\.site/|
+                            (?:www\.)?invidious\.xyz/|
                             (?:www\.)?invidious\.nixnet\.xyz/|
                             (?:www\.)?invidious\.drycat\.fr/|
                             (?:www\.)?tube\.poal\.co/|
+                            (?:www\.)?tube\.connect\.cafe/|
                             (?:www\.)?vid\.wxzm\.sx/|
+                            (?:www\.)?vid\.mint\.lgbt/|
                             (?:www\.)?yewtu\.be/|
                             (?:www\.)?yt\.elukerio\.org/|
                             (?:www\.)?yt\.lelux\.fi/|
@@ -506,7 +518,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
         '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
     }
-    _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')  # TODO 'json3' raising issues with automatic captions
+    _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
 
     _GEO_BYPASS = False
 
@@ -1092,7 +1104,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             },
         },
         {
-            # with '};' inside yt initial data (see https://github.com/ytdl-org/youtube-dl/issues/27093)
+            # with '};' inside yt initial data (see [1])
+            # see [2] for an example with '};' inside ytInitialPlayerResponse
+            # 1. https://github.com/ytdl-org/youtube-dl/issues/27093
+            # 2. https://github.com/ytdl-org/youtube-dl/issues/27216
             'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
             'info_dict': {
                 'id': 'CHqg6qOn4no',
@@ -1107,6 +1122,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'skip_download': True,
             },
         },
+        {
+            # another example of '};' in ytInitialData
+            'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
+            'only_matching': True,
+        },
     ]
 
     def __init__(self, *args, **kwargs):
@@ -1335,17 +1359,16 @@ def _get_ytplayer_config(self, video_id, webpage):
             return self._parse_json(
                 uppercase_escape(config), video_id, fatal=False)
 
-    def _get_automatic_captions(self, video_id, webpage):
+    def _get_automatic_captions(self, video_id, player_response, player_config):
         """We need the webpage for getting the captions url, pass it as an
            argument to speed up the process."""
         self.to_screen('%s: Looking for automatic captions' % video_id)
-        player_config = self._get_ytplayer_config(video_id, webpage)
         err_msg = 'Couldn\'t find automatic captions for %s' % video_id
-        if not player_config:
+        if not (player_response or player_config):
             self._downloader.report_warning(err_msg)
             return {}
         try:
-            args = player_config['args']
+            args = player_config.get('args') if player_config else {}
             caption_url = args.get('ttsurl')
             if caption_url:
                 timestamp = args['timestamp']
@@ -1404,19 +1427,15 @@ def make_captions(sub_url, sub_langs):
                 return captions
 
             # New captions format as of 22.06.2017
-            player_response = args.get('player_response')
-            if player_response and isinstance(player_response, compat_str):
-                player_response = self._parse_json(
-                    player_response, video_id, fatal=False)
-                if player_response:
-                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
-                    base_url = renderer['captionTracks'][0]['baseUrl']
-                    sub_lang_list = []
-                    for lang in renderer['translationLanguages']:
-                        lang_code = lang.get('languageCode')
-                        if lang_code:
-                            sub_lang_list.append(lang_code)
-                    return make_captions(base_url, sub_lang_list)
+            if player_response:
+                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
+                base_url = renderer['captionTracks'][0]['baseUrl']
+                sub_lang_list = []
+                for lang in renderer['translationLanguages']:
+                    lang_code = lang.get('languageCode')
+                    if lang_code:
+                        sub_lang_list.append(lang_code)
+                return make_captions(base_url, sub_lang_list)
 
             # Some videos don't provide ttsurl but rather caption_tracks and
             # caption_translation_languages (e.g. 20LmZk1hakA)
@@ -1771,7 +1790,8 @@ def extract_embedded_config(embed_webpage, video_id):
         if not video_info and not player_response:
             player_response = extract_player_response(
                 self._search_regex(
-                    r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;', video_webpage,
+                    (r'%s\s*%s' % (self._YT_INITIAL_PLAYER_RESPONSE_RE, self._YT_INITIAL_BOUNDARY_RE),
+                     self._YT_INITIAL_PLAYER_RESPONSE_RE), video_webpage,
                     'initial player response', default='{}'),
                 video_id)
 
@@ -2352,7 +2372,7 @@ def _extract_count(count_name):
         # subtitles
         video_subtitles = self.extract_subtitles(
             video_id, video_webpage, has_live_chat_replay)
-        automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
+        automatic_captions = self.extract_automatic_captions(video_id, player_response, ytplayer_config)
 
         video_duration = try_get(
             video_info, lambda x: int_or_none(x['length_seconds'][0]))
@@ -2373,16 +2393,25 @@ def _extract_count(count_name):
         # annotations
         video_annotations = None
         if self._downloader.params.get('writeannotations', False):
-            xsrf_token = self._search_regex(
-                r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
-                video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
+            xsrf_token = None
+            ytcfg = self._extract_ytcfg(video_id, video_webpage)
+            if ytcfg:
+                xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
+            if not xsrf_token:
+                xsrf_token = self._search_regex(
+                    r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
+                    video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
             invideo_url = try_get(
                 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
             if xsrf_token and invideo_url:
-                xsrf_field_name = self._search_regex(
-                    r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
-                    video_webpage, 'xsrf field name',
-                    group='xsrf_field_name', default='session_token')
+                xsrf_field_name = None
+                if ytcfg:
+                    xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
+                if not xsrf_field_name:
+                    xsrf_field_name = self._search_regex(
+                        r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
+                        video_webpage, 'xsrf field name',
+                        group='xsrf_field_name', default='session_token')
                 video_annotations = self._download_webpage(
                     self._proto_relative_url(invideo_url),
                     video_id, note='Downloading annotations',
@@ -2526,7 +2555,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
                                 feed/|
                                 (?:playlist|watch)\?.*?\blist=
                             )|
-                            (?!(%s)([/#?]|$))  # Direct URLs
+                            (?!(?:%s)\b)  # Direct URLs
                         )
                         (?P<id>[^/?\#&]+)
                     ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
@@ -2791,13 +2820,31 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
         # no longer available?
         'url': 'https://www.youtube.com/feed/recommended',
         'only_matching': True,
-    }
-        # TODO
-        # {
-        #     'url': 'https://www.youtube.com/TheYoungTurks/live',
-        #     'only_matching': True,
-        # }
-    ]
+    }, {
+        # inline playlist whose continuations do not always work
+        'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/course',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/zsecurity',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.youtube.com/NASAgovVideo/videos',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/TheYoungTurks/live',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if YoutubeIE.suitable(url) else super(
+            YoutubeTabIE, cls).suitable(url)
 
     def _extract_channel_id(self, webpage):
         channel_id = self._html_search_meta(
@@ -2894,12 +2941,17 @@ def _shelf_entries_from_content(self, shelf_renderer):
             # TODO
             pass
 
-    def _shelf_entries(self, shelf_renderer):
+    def _shelf_entries(self, shelf_renderer, skip_channels=False):
         ep = try_get(
             shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
             compat_str)
         shelf_url = urljoin('https://www.youtube.com', ep)
         if shelf_url:
+            # Skip links to other channels; note that checking for
+            # endpoint.commandMetadata.webCommandMetadata.webPageType == WEB_PAGE_TYPE_CHANNEL
+            # will not work
+            if skip_channels and '/channels?' in shelf_url:
+                return
             title = try_get(
                 shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
             yield self.url_result(shelf_url, video_title=title)
@@ -2986,6 +3038,16 @@ def _post_thread_continuation_entries(self, post_thread_continuation):
             for entry in self._post_thread_entries(renderer):
                 yield entry
 
+    @staticmethod
+    def _build_continuation_query(continuation, ctp=None):
+        query = {
+            'ctoken': continuation,
+            'continuation': continuation,
+        }
+        if ctp:
+            query['itct'] = ctp
+        return query
+
     @staticmethod
     def _extract_next_continuation_data(renderer):
         next_continuation = try_get(
@@ -2996,11 +3058,7 @@ def _extract_next_continuation_data(renderer):
         if not continuation:
             return
         ctp = next_continuation.get('clickTrackingParams')
-        return {
-            'ctoken': continuation,
-            'continuation': continuation,
-            'itct': ctp,
-        }
+        return YoutubeTabIE._build_continuation_query(continuation, ctp)
 
     @classmethod
     def _extract_continuation(cls, renderer):
@@ -3023,13 +3081,7 @@ def _extract_continuation(cls, renderer):
             if not continuation:
                 continue
             ctp = continuation_ep.get('clickTrackingParams')
-            if not ctp:
-                continue
-            return {
-                'ctoken': continuation,
-                'continuation': continuation,
-                'itct': ctp,
-            }
+            return YoutubeTabIE._build_continuation_query(continuation, ctp)
 
     def _entries(self, tab, identity_token):
 
@@ -3064,7 +3116,8 @@ def extract_entries(parent_renderer):  # this needs to called again for continua
                         continue
                     renderer = isr_content.get('shelfRenderer')
                     if renderer:
-                        for entry in self._shelf_entries(renderer):
+                        is_channels_tab = tab.get('title') == 'Channels'
+                        for entry in self._shelf_entries(renderer, not is_channels_tab):
                             yield entry
                         continue
                     renderer = isr_content.get('backstagePostThreadRenderer')
@@ -3086,9 +3139,12 @@ def extract_entries(parent_renderer):  # this needs to called again for continua
                 continuation_list[0] = self._extract_continuation(parent_renderer)
 
         continuation_list = [None]  # Python 2 doesnot support nonlocal
+        tab_content = try_get(tab, lambda x: x['content'], dict)
+        if not tab_content:
+            return
         parent_renderer = (
-            try_get(tab, lambda x: x['sectionListRenderer'], dict)
-            or try_get(tab, lambda x: x['richGridRenderer'], dict) or {})
+            try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
+            or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
         for entry in extract_entries(parent_renderer):
             yield entry
         continuation = continuation_list[0]
@@ -3103,10 +3159,24 @@ def extract_entries(parent_renderer):  # this needs to called again for continua
         for page_num in itertools.count(1):
             if not continuation:
                 break
-            browse = self._download_json(
-                'https://www.youtube.com/browse_ajax', None,
-                'Downloading page %d' % page_num,
-                headers=headers, query=continuation, fatal=False)
+            count = 0
+            retries = 3
+            while count <= retries:
+                try:
+                    # Downloading page may result in intermittent 5xx HTTP error
+                    # that is usually worked around with a retry
+                    browse = self._download_json(
+                        'https://www.youtube.com/browse_ajax', None,
+                        'Downloading page %d%s'
+                        % (page_num, ' (retry #%d)' % count if count else ''),
+                        headers=headers, query=continuation)
+                    break
+                except ExtractorError as e:
+                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
+                        count += 1
+                        if count <= retries:
+                            continue
+                    raise
             if not browse:
                 break
             response = try_get(browse, lambda x: x[1]['response'], dict)
@@ -3212,22 +3282,35 @@ def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
         if title is None:
             title = "Youtube " + playlist_id.title()
         playlist = self.playlist_result(
-            self._entries(selected_tab['content'], identity_token),
+            self._entries(selected_tab, identity_token),
             playlist_id=playlist_id, playlist_title=title,
             playlist_description=description)
         playlist.update(self._extract_uploader(data))
         return playlist
 
-    def _extract_from_playlist(self, item_id, data, playlist):
+    def _extract_from_playlist(self, item_id, url, data, playlist):
         title = playlist.get('title') or try_get(
             data, lambda x: x['titleText']['simpleText'], compat_str)
         playlist_id = playlist.get('playlistId') or item_id
+        # Inline playlist rendition continuations do not always work
+        # on YouTube's side, so we delegate to regular tab-based playlist
+        # URL processing whenever possible.
+        playlist_url = urljoin(url, try_get(
+            playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
+            compat_str))
+        if playlist_url and playlist_url != url:
+            return self.url_result(
+                playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
+                video_title=title)
         return self.playlist_result(
             self._playlist_entries(playlist), playlist_id=playlist_id,
             playlist_title=title)
 
-    def _extract_alerts(self, data):
+    @staticmethod
+    def _extract_alerts(data):
         for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
+            if not isinstance(alert_dict, dict):
+                continue
             for renderer in alert_dict:
                 alert = alert_dict[renderer]
                 alert_type = alert.get('type')
@@ -3241,6 +3324,16 @@ def _extract_alerts(self, data):
                     if message:
                         yield alert_type, message
 
+    def _extract_identity_token(self, webpage, item_id):
+        ytcfg = self._extract_ytcfg(item_id, webpage)
+        if ytcfg:
+            token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
+            if token:
+                return token
+        return self._search_regex(
+            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
+            'identity token', default=None)
+
     def _real_extract(self, url):
         item_id = self._match_id(url)
         url = compat_urlparse.urlunparse(
@@ -3257,7 +3350,7 @@ def _real_extract(self, url):
         video_id = qs.get('v', [None])[0]
         playlist_id = qs.get('list', [None])[0]
 
-        if is_home.group('not_channel') is not None and is_home.group('not_channel').startswith('watch') and not video_id:
+        if is_home is not None and is_home.group('not_channel') is not None and is_home.group('not_channel').startswith('watch') and not video_id:
             if playlist_id:
                 self._downloader.report_warning('%s is not a valid Youtube URL. Trying to download playlist %s' % (url, playlist_id))
                 url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
@@ -3271,9 +3364,7 @@ def _real_extract(self, url):
             self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
 
         webpage = self._download_webpage(url, item_id)
-        identity_token = self._search_regex(
-            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
-            'identity token', default=None)
+        identity_token = self._extract_identity_token(webpage, item_id)
         data = self._extract_yt_initial_data(item_id, webpage)
         for alert_type, alert_message in self._extract_alerts(data):
             self._downloader.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
@@ -3284,7 +3375,7 @@ def _real_extract(self, url):
         playlist = try_get(
             data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
         if playlist:
-            return self._extract_from_playlist(item_id, data, playlist)
+            return self._extract_from_playlist(item_id, url, data, playlist)
         # Fallback to video extraction if no playlist alike page is recognized.
         # First check for the current video then try the v attribute of URL query.
         video_id = try_get(
@@ -3304,8 +3395,7 @@ class YoutubePlaylistIE(InfoExtractor):
                         (?:
                             (?:
                                 youtube(?:kids)?\.com|
-                                invidio\.us|
-                                youtu\.be
+                                invidio\.us
                             )
                             /.*?\?.*?\blist=
                         )?
@@ -3350,6 +3440,32 @@ class YoutubePlaylistIE(InfoExtractor):
             'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
         }
     }, {
+        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
+        'only_matching': True,
+    }, {
+        # music album playlist
+        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if YoutubeTabIE.suitable(url) else super(
+            YoutubePlaylistIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+        if not qs:
+            qs = {'list': playlist_id}
+        return self.url_result(
+            update_url_query('https://www.youtube.com/playlist', qs),
+            ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
+
+
+class YoutubeYtBeIE(InfoExtractor):
+    _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
+    _TESTS = [{
         'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
         'info_dict': {
             'id': 'yeWKywCrFtk',
@@ -3372,28 +3488,18 @@ class YoutubePlaylistIE(InfoExtractor):
     }, {
         'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
         'only_matching': True,
-    }, {
-        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
-        'only_matching': True,
-    }, {
-        # music album playlist
-        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
-        'only_matching': True,
     }]
 
-    @classmethod
-    def suitable(cls, url):
-        return False if YoutubeTabIE.suitable(url) else super(
-            YoutubePlaylistIE, cls).suitable(url)
-
     def _real_extract(self, url):
-        playlist_id = self._match_id(url)
-        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
-        if not qs:
-            qs = {'list': playlist_id}
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        playlist_id = mobj.group('playlist_id')
         return self.url_result(
-            update_url_query('https://www.youtube.com/playlist', qs),
-            ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
+            update_url_query('https://www.youtube.com/watch', {
+                'v': video_id,
+                'list': playlist_id,
+                'feature': 'youtu.be',
+            }), ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
 
 
 class YoutubeYtUserIE(InfoExtractor):
index 7b5ad4a6e85398331fe4c7926cbc378f3b624f0f..d9b393e6e76d658cda6565ab41c560d047ac0226 100644 (file)
@@ -41,7 +41,7 @@ def _extract_player(self, webpage, video_id, fatal=True):
 class ZDFIE(ZDFBaseIE):
     IE_NAME = "ZDF-3sat"
     _VALID_URL = r'https?://www\.(zdf|3sat)\.de/(?:[^/]+/)*(?P<id>[^/?]+)\.html'
-    _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh')
+    _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd')
     _GEO_COUNTRIES = ['DE']
 
     _TESTS = [{
@@ -131,7 +131,7 @@ def _extract_entry(self, url, player, content, video_id):
         if not ptmd_path:
             ptmd_path = t[
                 'http://zdf.de/rels/streams/ptmd-template'].replace(
-                '{playerId}', 'portal')
+                '{playerId}', 'ngplayer_2_4')
 
         ptmd = self._call_api(
             urljoin(url, ptmd_path), player, url, video_id, 'metadata')
index 2e2e97a0c4454971dab30136518ca26e903b9470..5288f40d8b895be8c4c40c2df901529798050ce1 100644 (file)
@@ -85,7 +85,13 @@ def _real_extract(self, url):
         else:
             m3u8_url = self._search_regex(
                 r'(["\'])(?P<url>(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1',
-                body, 'm3u8 url', group='url')
+                body, 'm3u8 url', group='url', default=None)
+            if not m3u8_url:
+                source = self._parse_json(self._search_regex(
+                    r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body,
+                    'source'), video_id, js_to_json)
+                if source.get('integration') == 'verizon-media':
+                    m3u8_url = 'https://content.uplynk.com/%s.m3u8' % source['id']
             formats = self._extract_m3u8_formats(
                 m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
             text_tracks = self._search_regex(
index 733eec8bd11af2194053c37589d8ec69a0c311a0..a48a3f1f12eba7e55074761da9e3826d0c1ce07a 100644 (file)
@@ -394,7 +394,7 @@ def _comma_separated_values_options_callback(option, opt_str, value, parser):
     authentication.add_option(
         '--video-password',
         dest='videopassword', metavar='PASSWORD',
-        help='Video password (vimeo, smotri, youku)')
+        help='Video password (vimeo, youku)')
 
     adobe_pass = optparse.OptionGroup(parser, 'Adobe Pass Options')
     adobe_pass.add_option(
index 8c2c377af5463415dd4a31e2a025d4e47e52cfc6..bc41f5498463d4350eba7cd8d6ddcc063248faf4 100644 (file)
@@ -3650,7 +3650,7 @@ def url_or_none(url):
     if not url or not isinstance(url, compat_str):
         return None
     url = url.strip()
-    return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None
+    return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
 
 
 def parse_duration(s):