Update to ytdl-commit-4fb25ff
author	pukkandan <redacted>
Sat, 10 Apr 2021 16:47:11 +0000 (22:17 +0530)
committer	pukkandan <redacted>
Sat, 10 Apr 2021 16:49:54 +0000 (22:19 +0530)
[maoritv] Add new extractor
https://github.com/ytdl-org/youtube-dl/commit/4fb25ff5a3be5206bb72e5c4046715b1529fb2c7

Except:
[vimeo] improve extraction https://github.com/ytdl-org/youtube-dl/commit/3ae9c0f410b1d4f63e8bada67dd62a8d2852be32
[youtube:tab] Pass innertube context... https://github.com/ytdl-org/youtube-dl/commit/1b0a13f33cfb3644cc718d35951ea85bb1905459

12 files changed:
yt_dlp/compat.py
yt_dlp/extractor/cbssports.py
yt_dlp/extractor/common.py
yt_dlp/extractor/curiositystream.py
yt_dlp/extractor/extractors.py
yt_dlp/extractor/jamendo.py
yt_dlp/extractor/line.py
yt_dlp/extractor/maoritv.py [new file with mode: 0644]
yt_dlp/extractor/mtv.py
yt_dlp/extractor/pornhub.py
yt_dlp/extractor/youku.py
yt_dlp/extractor/youtube.py

diff --git a/yt_dlp/compat.py b/yt_dlp/compat.py
index 9659d480e12934a2d6f7695c3adfd1a7d0f5e21d..3ebf1ee7a6f02f48dc7a14bd85c1f87b685a9687 100644 (file)
@@ -78,6 +78,15 @@ def __init__(self, version, name, value, *args, **kwargs):
 except ImportError:  # Python 2
     import Cookie as compat_cookies
 
+if sys.version_info[0] == 2:
+    class compat_cookies_SimpleCookie(compat_cookies.SimpleCookie):
+        def load(self, rawdata):
+            if isinstance(rawdata, compat_str):
+                rawdata = str(rawdata)
+            return super(compat_cookies_SimpleCookie, self).load(rawdata)
+else:
+    compat_cookies_SimpleCookie = compat_cookies.SimpleCookie
+
 try:
     import html.entities as compat_html_entities
 except ImportError:  # Python 2
@@ -3020,6 +3029,7 @@ def compat_ctypes_WINFUNCTYPE(*args, **kwargs):
     'compat_cookiejar',
     'compat_cookiejar_Cookie',
     'compat_cookies',
+    'compat_cookies_SimpleCookie',
     'compat_ctypes_WINFUNCTYPE',
     'compat_etree_Element',
     'compat_etree_fromstring',
diff --git a/yt_dlp/extractor/cbssports.py b/yt_dlp/extractor/cbssports.py
index 83b76476245d553f6f9bc976723f2573dd6c72c2..a891c9a5572c2ebb1ea51af234869055f335cf1a 100644 (file)
 from __future__ import unicode_literals
 
-from .cbs import CBSBaseIE
+import re
 
+# from .cbs import CBSBaseIE
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    try_get,
+)
 
-class CBSSportsIE(CBSBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/(?:video|news)/(?P<id>[^/?#&]+)'
 
+# class CBSSportsEmbedIE(CBSBaseIE):
+class CBSSportsEmbedIE(InfoExtractor):
+    IE_NAME = 'cbssports:embed'
+    _VALID_URL = r'''(?ix)https?://(?:(?:www\.)?cbs|embed\.247)sports\.com/player/embed.+?
+        (?:
+            ids%3D(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})|
+            pcid%3D(?P<pcid>\d+)
+        )'''
     _TESTS = [{
-        'url': 'https://www.cbssports.com/nba/video/donovan-mitchell-flashes-star-potential-in-game-2-victory-over-thunder/',
-        'info_dict': {
-            'id': '1214315075735',
-            'ext': 'mp4',
-            'title': 'Donovan Mitchell flashes star potential in Game 2 victory over Thunder',
-            'description': 'md5:df6f48622612c2d6bd2e295ddef58def',
-            'timestamp': 1524111457,
-            'upload_date': '20180419',
-            'uploader': 'CBSI-NEW',
-        },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        }
+        'url': 'https://www.cbssports.com/player/embed/?args=player_id%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26ids%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26resizable%3D1%26autoplay%3Dtrue%26domain%3Dcbssports.com%26comp_ads_enabled%3Dfalse%26watchAndRead%3D0%26startTime%3D0%26env%3Dprod',
+        'only_matching': True,
     }, {
-        'url': 'https://www.cbssports.com/nba/news/nba-playoffs-2018-watch-76ers-vs-heat-game-3-series-schedule-tv-channel-online-stream/',
+        'url': 'https://embed.247sports.com/player/embed/?args=%3fplayer_id%3d1827823171591%26channel%3dcollege-football-recruiting%26pcid%3d1827823171591%26width%3d640%26height%3d360%26autoplay%3dTrue%26comp_ads_enabled%3dFalse%26uvpc%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_v4%2526partner%253d247%26uvpc_m%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_m_v4%2526partner_m%253d247_mobile%26utag%3d247sportssite%26resizable%3dTrue',
         'only_matching': True,
     }]
 
-    def _extract_video_info(self, filter_query, video_id):
-        return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id)
+    # def _extract_video_info(self, filter_query, video_id):
+    #     return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id)
 
+    def _real_extract(self, url):
+        uuid, pcid = re.match(self._VALID_URL, url).groups()
+        query = {'id': uuid} if uuid else {'pcid': pcid}
+        video = self._download_json(
+            'https://www.cbssports.com/api/content/video/',
+            uuid or pcid, query=query)[0]
+        video_id = video['id']
+        title = video['title']
+        metadata = video.get('metaData') or {}
+        # return self._extract_video_info('byId=%d' % metadata['mpxOutletId'], video_id)
+        # return self._extract_video_info('byGuid=' + metadata['mpxRefId'], video_id)
+
+        formats = self._extract_m3u8_formats(
+            metadata['files'][0]['url'], video_id, 'mp4',
+            'm3u8_native', m3u8_id='hls', fatal=False)
+        self._sort_formats(formats)
+
+        image = video.get('image')
+        thumbnails = None
+        if image:
+            image_path = image.get('path')
+            if image_path:
+                thumbnails = [{
+                    'url': image_path,
+                    'width': int_or_none(image.get('width')),
+                    'height': int_or_none(image.get('height')),
+                    'filesize': int_or_none(image.get('size')),
+                }]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnails': thumbnails,
+            'description': video.get('description'),
+            'timestamp': int_or_none(try_get(video, lambda x: x['dateCreated']['epoch'])),
+            'duration': int_or_none(metadata.get('duration')),
+        }
+
+
+class CBSSportsBaseIE(InfoExtractor):
     def _real_extract(self, url):
         display_id = self._match_id(url)
         webpage = self._download_webpage(url, display_id)
-        video_id = self._search_regex(
-            [r'(?:=|%26)pcid%3D(\d+)', r'embedVideo(?:Container)?_(\d+)'],
-            webpage, 'video id')
-        return self._extract_video_info('byId=%s' % video_id, video_id)
+        iframe_url = self._search_regex(
+            r'<iframe[^>]+(?:data-)?src="(https?://[^/]+/player/embed[^"]+)"',
+            webpage, 'embed url')
+        return self.url_result(iframe_url, CBSSportsEmbedIE.ie_key())
+
+
+class CBSSportsIE(CBSSportsBaseIE):
+    IE_NAME = 'cbssports'
+    _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/video/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://www.cbssports.com/college-football/video/cover-3-stanford-spring-gleaning/',
+        'info_dict': {
+            'id': 'b56c03a6-231a-4bbe-9c55-af3c8a8e9636',
+            'ext': 'mp4',
+            'title': 'Cover 3: Stanford Spring Gleaning',
+            'description': 'The Cover 3 crew break down everything you need to know about the Stanford Cardinal this spring.',
+            'timestamp': 1617218398,
+            'upload_date': '20210331',
+            'duration': 502,
+        },
+    }]
+
+
+class TwentyFourSevenSportsIE(CBSSportsBaseIE):
+    IE_NAME = '247sports'
+    _VALID_URL = r'https?://(?:www\.)?247sports\.com/Video/(?:[^/?#&]+-)?(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://247sports.com/Video/2021-QB-Jake-Garcia-senior-highlights-through-five-games-10084854/',
+        'info_dict': {
+            'id': '4f1265cb-c3b5-44a8-bb1d-1914119a0ccc',
+            'ext': 'mp4',
+            'title': '2021 QB Jake Garcia senior highlights through five games',
+            'description': 'md5:8cb67ebed48e2e6adac1701e0ff6e45b',
+            'timestamp': 1607114223,
+            'upload_date': '20201204',
+            'duration': 208,
+        },
+    }]
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index be3164ede1c6bb4149a6dca1540e67bf0199c7f0..79f9c403573f876d792be931bc854e4ab579a263 100644 (file)
@@ -17,7 +17,7 @@
 
 from ..compat import (
     compat_cookiejar_Cookie,
-    compat_cookies,
+    compat_cookies_SimpleCookie,
     compat_etree_Element,
     compat_etree_fromstring,
     compat_getpass,
@@ -1308,6 +1308,7 @@ def extract_interaction_statistic(e):
 
         def extract_video_object(e):
             assert e['@type'] == 'VideoObject'
+            author = e.get('author')
             info.update({
                 'url': url_or_none(e.get('contentUrl')),
                 'title': unescapeHTML(e.get('name')),
@@ -1315,7 +1316,11 @@ def extract_video_object(e):
                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                 'duration': parse_duration(e.get('duration')),
                 'timestamp': unified_timestamp(e.get('uploadDate')),
-                'uploader': str_or_none(e.get('author')),
+                # author can be an instance of 'Organization' or 'Person' types.
+                # both types can have 'name' property(inherited from 'Thing' type). [1]
+                # however some websites are using 'Text' type instead.
+                # 1. https://schema.org/VideoObject
+                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                 'filesize': float_or_none(e.get('contentSize')),
                 'tbr': int_or_none(e.get('bitrate')),
                 'width': int_or_none(e.get('width')),
@@ -3219,13 +3224,10 @@ def _set_cookie(self, domain, name, value, expire_time=None, port=None,
         self._downloader.cookiejar.set_cookie(cookie)
 
     def _get_cookies(self, url):
-        """ Return a compat_cookies.SimpleCookie with the cookies for the url """
+        """ Return a compat_cookies_SimpleCookie with the cookies for the url """
         req = sanitized_Request(url)
         self._downloader.cookiejar.add_cookie_header(req)
-        cookie = req.get_header('Cookie')
-        if cookie and sys.version_info[0] == 2:
-            cookie = str(cookie)
-        return compat_cookies.SimpleCookie(cookie)
+        return compat_cookies_SimpleCookie(req.get_header('Cookie'))
 
     def _apply_first_set_cookie_header(self, url_handle, cookie):
         """
diff --git a/yt_dlp/extractor/curiositystream.py b/yt_dlp/extractor/curiositystream.py
index e4a7fca6ce1e7352a8ac2785c1fa3e73445c9cca..ae64a07d719caa6cf09540864e7cc962ad481ae6 100644 (file)
@@ -25,12 +25,12 @@ def _handle_errors(self, result):
             raise ExtractorError(
                 '%s said: %s' % (self.IE_NAME, error), expected=True)
 
-    def _call_api(self, path, video_id):
+    def _call_api(self, path, video_id, query=None):
         headers = {}
         if self._auth_token:
             headers['X-Auth-Token'] = self._auth_token
         result = self._download_json(
-            self._API_BASE_URL + path, video_id, headers=headers)
+            self._API_BASE_URL + path, video_id, headers=headers, query=query)
         self._handle_errors(result)
         return result['data']
 
@@ -52,62 +52,75 @@ class CuriosityStreamIE(CuriosityStreamBaseIE):
     _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P<id>\d+)'
     _TEST = {
         'url': 'https://app.curiositystream.com/video/2',
-        'md5': '262bb2f257ff301115f1973540de8983',
         'info_dict': {
             'id': '2',
             'ext': 'mp4',
             'title': 'How Did You Develop The Internet?',
             'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.',
-        }
+        },
+        'params': {
+            'format': 'bestvideo',
+            # m3u8 download
+            'skip_download': True,
+        },
     }
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        media = self._call_api('media/' + video_id, video_id)
-        title = media['title']
 
         formats = []
-        for encoding in media.get('encodings', []):
-            m3u8_url = encoding.get('master_playlist_url')
-            if m3u8_url:
-                formats.extend(self._extract_m3u8_formats(
-                    m3u8_url, video_id, 'mp4', 'm3u8_native',
-                    m3u8_id='hls', fatal=False))
-            encoding_url = encoding.get('url')
-            file_url = encoding.get('file_url')
-            if not encoding_url and not file_url:
-                continue
-            f = {
-                'width': int_or_none(encoding.get('width')),
-                'height': int_or_none(encoding.get('height')),
-                'vbr': int_or_none(encoding.get('video_bitrate')),
-                'abr': int_or_none(encoding.get('audio_bitrate')),
-                'filesize': int_or_none(encoding.get('size_in_bytes')),
-                'vcodec': encoding.get('video_codec'),
-                'acodec': encoding.get('audio_codec'),
-                'container': encoding.get('container_type'),
-            }
-            for f_url in (encoding_url, file_url):
-                if not f_url:
+        for encoding_format in ('m3u8', 'mpd'):
+            media = self._call_api('media/' + video_id, video_id, query={
+                'encodingsNew': 'true',
+                'encodingsFormat': encoding_format,
+            })
+            for encoding in media.get('encodings', []):
+                playlist_url = encoding.get('master_playlist_url')
+                if encoding_format == 'm3u8':
+                    # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol
+                    formats.extend(self._extract_m3u8_formats(
+                        playlist_url, video_id, 'mp4',
+                        m3u8_id='hls', fatal=False))
+                elif encoding_format == 'mpd':
+                    formats.extend(self._extract_mpd_formats(
+                        playlist_url, video_id, mpd_id='dash', fatal=False))
+                encoding_url = encoding.get('url')
+                file_url = encoding.get('file_url')
+                if not encoding_url and not file_url:
                     continue
-                fmt = f.copy()
-                rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', f_url)
-                if rtmp:
-                    fmt.update({
-                        'url': rtmp.group('url'),
-                        'play_path': rtmp.group('playpath'),
-                        'app': rtmp.group('app'),
-                        'ext': 'flv',
-                        'format_id': 'rtmp',
-                    })
-                else:
-                    fmt.update({
-                        'url': f_url,
-                        'format_id': 'http',
-                    })
-                formats.append(fmt)
+                f = {
+                    'width': int_or_none(encoding.get('width')),
+                    'height': int_or_none(encoding.get('height')),
+                    'vbr': int_or_none(encoding.get('video_bitrate')),
+                    'abr': int_or_none(encoding.get('audio_bitrate')),
+                    'filesize': int_or_none(encoding.get('size_in_bytes')),
+                    'vcodec': encoding.get('video_codec'),
+                    'acodec': encoding.get('audio_codec'),
+                    'container': encoding.get('container_type'),
+                }
+                for f_url in (encoding_url, file_url):
+                    if not f_url:
+                        continue
+                    fmt = f.copy()
+                    rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', f_url)
+                    if rtmp:
+                        fmt.update({
+                            'url': rtmp.group('url'),
+                            'play_path': rtmp.group('playpath'),
+                            'app': rtmp.group('app'),
+                            'ext': 'flv',
+                            'format_id': 'rtmp',
+                        })
+                    else:
+                        fmt.update({
+                            'url': f_url,
+                            'format_id': 'http',
+                        })
+                    formats.append(fmt)
         self._sort_formats(formats)
 
+        title = media['title']
+
         subtitles = {}
         for closed_caption in media.get('closed_captions', []):
             sub_url = closed_caption.get('file')
@@ -140,7 +153,7 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE):
             'title': 'Curious Minds: The Internet',
             'description': 'How is the internet shaping our lives in the 21st Century?',
         },
-        'playlist_mincount': 17,
+        'playlist_mincount': 16,
     }, {
         'url': 'https://curiositystream.com/series/2',
         'only_matching': True,
diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py
index b9e3c4ad3598deaad4d66cf785b63bc51c6ff923..424406177b2f6cbcc01ffa74740c53051a8aa290 100644 (file)
     CBSNewsIE,
     CBSNewsLiveVideoIE,
 )
-from .cbssports import CBSSportsIE
+from .cbssports import (
+    CBSSportsEmbedIE,
+    CBSSportsIE,
+    TwentyFourSevenSportsIE,
+)
 from .ccc import (
     CCCIE,
     CCCPlaylistIE,
     LimelightChannelIE,
     LimelightChannelListIE,
 )
-from .line import LineTVIE
+from .line import (
+    LineTVIE,
+    LineLiveIE,
+    LineLiveChannelIE,
+)
 from .linkedin import (
     LinkedInLearningIE,
     LinkedInLearningCourseIE,
     MangomoloLiveIE,
 )
 from .manyvids import ManyVidsIE
+from .maoritv import MaoriTVIE
 from .markiza import (
     MarkizaIE,
     MarkizaPageIE,
diff --git a/yt_dlp/extractor/jamendo.py b/yt_dlp/extractor/jamendo.py
index 490efa8fb4aadcbf626295338fbd8c40e818d32d..1db7c64afa821fc2a3020b7317670a5daa4ab41d 100644 (file)
@@ -29,34 +29,51 @@ class JamendoIE(InfoExtractor):
             'id': '196219',
             'display_id': 'stories-from-emona-i',
             'ext': 'flac',
-            'title': 'Maya Filipič - Stories from Emona I',
-            'artist': 'Maya Filipič',
+            # 'title': 'Maya Filipič - Stories from Emona I',
+            'title': 'Stories from Emona I',
+            # 'artist': 'Maya Filipič',
             'track': 'Stories from Emona I',
             'duration': 210,
             'thumbnail': r're:^https?://.*\.jpg',
             'timestamp': 1217438117,
             'upload_date': '20080730',
+            'license': 'by-nc-nd',
+            'view_count': int,
+            'like_count': int,
+            'average_rating': int,
+            'tags': ['piano', 'peaceful', 'newage', 'strings', 'upbeat'],
         }
     }, {
         'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock',
         'only_matching': True,
     }]
 
+    def _call_api(self, resource, resource_id):
+        path = '/api/%ss' % resource
+        rand = compat_str(random.random())
+        return self._download_json(
+            'https://www.jamendo.com' + path, resource_id, query={
+                'id[]': resource_id,
+            }, headers={
+                'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand)
+            })[0]
+
     def _real_extract(self, url):
         track_id, display_id = self._VALID_URL_RE.match(url).groups()
-        webpage = self._download_webpage(
-            'https://www.jamendo.com/track/' + track_id, track_id)
-        models = self._parse_json(self._html_search_regex(
-            r"data-bundled-models='([^']+)",
-            webpage, 'bundled models'), track_id)
-        track = models['track']['models'][0]
+        # webpage = self._download_webpage(
+        #     'https://www.jamendo.com/track/' + track_id, track_id)
+        # models = self._parse_json(self._html_search_regex(
+        #     r"data-bundled-models='([^']+)",
+        #     webpage, 'bundled models'), track_id)
+        # track = models['track']['models'][0]
+        track = self._call_api('track', track_id)
         title = track_name = track['name']
-        get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {}
-        artist = get_model('artist')
-        artist_name = artist.get('name')
-        if artist_name:
-            title = '%s - %s' % (artist_name, title)
-        album = get_model('album')
+        # get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {}
+        # artist = get_model('artist')
+        # artist_name = artist.get('name')
+        # if artist_name:
+        #     title = '%s - %s' % (artist_name, title)
+        # album = get_model('album')
 
         formats = [{
             'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294'
@@ -74,7 +91,7 @@ def _real_extract(self, url):
 
         urls = []
         thumbnails = []
-        for _, covers in track.get('cover', {}).items():
+        for covers in (track.get('cover') or {}).values():
             for cover_id, cover_url in covers.items():
                 if not cover_url or cover_url in urls:
                     continue
@@ -88,13 +105,14 @@ def _real_extract(self, url):
                 })
 
         tags = []
-        for tag in track.get('tags', []):
+        for tag in (track.get('tags') or []):
             tag_name = tag.get('name')
             if not tag_name:
                 continue
             tags.append(tag_name)
 
         stats = track.get('stats') or {}
+        license = track.get('licenseCC') or []
 
         return {
             'id': track_id,
@@ -103,11 +121,11 @@ def _real_extract(self, url):
             'title': title,
             'description': track.get('description'),
             'duration': int_or_none(track.get('duration')),
-            'artist': artist_name,
+            # 'artist': artist_name,
             'track': track_name,
-            'album': album.get('name'),
+            # 'album': album.get('name'),
             'formats': formats,
-            'license': '-'.join(track.get('licenseCC', [])) or None,
+            'license': '-'.join(license) if license else None,
             'timestamp': int_or_none(track.get('dateCreated')),
             'view_count': int_or_none(stats.get('listenedAll')),
             'like_count': int_or_none(stats.get('favorited')),
@@ -116,9 +134,9 @@ def _real_extract(self, url):
         }
 
 
-class JamendoAlbumIE(InfoExtractor):
+class JamendoAlbumIE(JamendoIE):
     _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)'
-    _TEST = {
+    _TESTS = [{
         'url': 'https://www.jamendo.com/album/121486/duck-on-cover',
         'info_dict': {
             'id': '121486',
@@ -151,17 +169,7 @@ class JamendoAlbumIE(InfoExtractor):
         'params': {
             'playlistend': 2
         }
-    }
-
-    def _call_api(self, resource, resource_id):
-        path = '/api/%ss' % resource
-        rand = compat_str(random.random())
-        return self._download_json(
-            'https://www.jamendo.com' + path, resource_id, query={
-                'id[]': resource_id,
-            }, headers={
-                'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand)
-            })[0]
+    }]
 
     def _real_extract(self, url):
         album_id = self._match_id(url)
@@ -169,7 +177,7 @@ def _real_extract(self, url):
         album_name = album.get('name')
 
         entries = []
-        for track in album.get('tracks', []):
+        for track in (album.get('tracks') or []):
             track_id = track.get('id')
             if not track_id:
                 continue
diff --git a/yt_dlp/extractor/line.py b/yt_dlp/extractor/line.py
index 7f5fa446ea451cdb49a7dccc1f76fc57a5813210..2526daa7716274ac31521b8d7b81d74f5bc9f9cb 100644 (file)
@@ -4,7 +4,13 @@
 import re
 
 from .common import InfoExtractor
-from ..utils import js_to_json
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    js_to_json,
+    str_or_none,
+)
 
 
 class LineTVIE(InfoExtractor):
@@ -88,3 +94,137 @@ def _real_extract(self, url):
                            for thumbnail in video_info.get('thumbnails', {}).get('list', [])],
             'view_count': video_info.get('meta', {}).get('count'),
         }
+
+
+class LineLiveBaseIE(InfoExtractor):
+    _API_BASE_URL = 'https://live-api.line-apps.com/web/v4.0/channel/'
+
+    def _parse_broadcast_item(self, item):
+        broadcast_id = compat_str(item['id'])
+        title = item['title']
+        is_live = item.get('isBroadcastingNow')
+
+        thumbnails = []
+        for thumbnail_id, thumbnail_url in (item.get('thumbnailURLs') or {}).items():
+            if not thumbnail_url:
+                continue
+            thumbnails.append({
+                'id': thumbnail_id,
+                'url': thumbnail_url,
+            })
+
+        channel = item.get('channel') or {}
+        channel_id = str_or_none(channel.get('id'))
+
+        return {
+            'id': broadcast_id,
+            'title': self._live_title(title) if is_live else title,
+            'thumbnails': thumbnails,
+            'timestamp': int_or_none(item.get('createdAt')),
+            'channel': channel.get('name'),
+            'channel_id': channel_id,
+            'channel_url': 'https://live.line.me/channels/' + channel_id if channel_id else None,
+            'duration': int_or_none(item.get('archiveDuration')),
+            'view_count': int_or_none(item.get('viewerCount')),
+            'comment_count': int_or_none(item.get('chatCount')),
+            'is_live': is_live,
+        }
+
+
+class LineLiveIE(LineLiveBaseIE):
+    _VALID_URL = r'https?://live\.line\.me/channels/(?P<channel_id>\d+)/broadcast/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://live.line.me/channels/4867368/broadcast/16331360',
+        'md5': 'bc931f26bf1d4f971e3b0982b3fab4a3',
+        'info_dict': {
+            'id': '16331360',
+            'title': '振りコピ講座😙😙😙',
+            'ext': 'mp4',
+            'timestamp': 1617095132,
+            'upload_date': '20210330',
+            'channel': '白川ゆめか',
+            'channel_id': '4867368',
+            'view_count': int,
+            'comment_count': int,
+            'is_live': False,
+        }
+    }, {
+        # archiveStatus == 'DELETED'
+        'url': 'https://live.line.me/channels/4778159/broadcast/16378488',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        channel_id, broadcast_id = re.match(self._VALID_URL, url).groups()
+        broadcast = self._download_json(
+            self._API_BASE_URL + '%s/broadcast/%s' % (channel_id, broadcast_id),
+            broadcast_id)
+        item = broadcast['item']
+        info = self._parse_broadcast_item(item)
+        protocol = 'm3u8' if info['is_live'] else 'm3u8_native'
+        formats = []
+        for k, v in (broadcast.get(('live' if info['is_live'] else 'archived') + 'HLSURLs') or {}).items():
+            if not v:
+                continue
+            if k == 'abr':
+                formats.extend(self._extract_m3u8_formats(
+                    v, broadcast_id, 'mp4', protocol,
+                    m3u8_id='hls', fatal=False))
+                continue
+            f = {
+                'ext': 'mp4',
+                'format_id': 'hls-' + k,
+                'protocol': protocol,
+                'url': v,
+            }
+            if not k.isdigit():
+                f['vcodec'] = 'none'
+            formats.append(f)
+        if not formats:
+            archive_status = item.get('archiveStatus')
+            if archive_status != 'ARCHIVED':
+                raise ExtractorError('this video has been ' + archive_status.lower(), expected=True)
+        self._sort_formats(formats)
+        info['formats'] = formats
+        return info
+
+
+class LineLiveChannelIE(LineLiveBaseIE):
+    _VALID_URL = r'https?://live\.line\.me/channels/(?P<id>\d+)(?!/broadcast/\d+)(?:[/?&#]|$)'
+    _TEST = {
+        'url': 'https://live.line.me/channels/5893542',
+        'info_dict': {
+            'id': '5893542',
+            'title': 'いくらちゃん',
+            'description': 'md5:c3a4af801f43b2fac0b02294976580be',
+        },
+        'playlist_mincount': 29
+    }
+
+    def _archived_broadcasts_entries(self, archived_broadcasts, channel_id):
+        while True:
+            for row in (archived_broadcasts.get('rows') or []):
+                share_url = str_or_none(row.get('shareURL'))
+                if not share_url:
+                    continue
+                info = self._parse_broadcast_item(row)
+                info.update({
+                    '_type': 'url',
+                    'url': share_url,
+                    'ie_key': LineLiveIE.ie_key(),
+                })
+                yield info
+            if not archived_broadcasts.get('hasNextPage'):
+                return
+            archived_broadcasts = self._download_json(
+                self._API_BASE_URL + channel_id + '/archived_broadcasts',
+                channel_id, query={
+                    'lastId': info['id'],
+                })
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+        channel = self._download_json(self._API_BASE_URL + channel_id, channel_id)
+        return self.playlist_result(
+            self._archived_broadcasts_entries(channel.get('archivedBroadcasts') or {}, channel_id),
+            channel_id, channel.get('title'), channel.get('information'))
diff --git a/yt_dlp/extractor/maoritv.py b/yt_dlp/extractor/maoritv.py
new file mode 100644 (file)
index 0000000..0d23fec
--- /dev/null
+++ b/yt_dlp/extractor/maoritv.py
@@ -0,0 +1,31 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class MaoriTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?maoritelevision\.com/shows/(?:[^/]+/)+(?P<id>[^/?&#]+)'
+    _TEST = {
+        'url': 'https://www.maoritelevision.com/shows/korero-mai/S01E054/korero-mai-series-1-episode-54',
+        'md5': '5ade8ef53851b6a132c051b1cd858899',
+        'info_dict': {
+            'id': '4774724855001',
+            'ext': 'mp4',
+            'title': 'Kōrero Mai, Series 1 Episode 54',
+            'upload_date': '20160226',
+            'timestamp': 1456455018,
+            'description': 'md5:59bde32fd066d637a1a55794c56d8dcb',
+            'uploader_id': '1614493167001',
+        },
+    }
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1614493167001/HJlhIQhQf_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        brightcove_id = self._search_regex(
+            r'data-main-video-id=["\'](\d+)', webpage, 'brightcove id')
+        return self.url_result(
+            self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+            'BrightcoveNew', brightcove_id)
diff --git a/yt_dlp/extractor/mtv.py b/yt_dlp/extractor/mtv.py
index 4cf178b04a9296508e7125f156b0d8c01a8e125f..510f1439e5a7e618ff619a9a770db892f6edae03 100644 (file)
@@ -275,7 +275,9 @@ def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None):
 
     @staticmethod
     def _extract_child_with_type(parent, t):
-        return next(c for c in parent['children'] if c.get('type') == t)
+        for c in parent['children']:
+            if c.get('type') == t:
+                return c
 
     def _extract_mgid(self, webpage):
         try:
@@ -306,7 +308,8 @@ def _extract_mgid(self, webpage):
             data = self._parse_json(self._search_regex(
                 r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None)
             main_container = self._extract_child_with_type(data, 'MainContainer')
-            video_player = self._extract_child_with_type(main_container, 'VideoPlayer')
+            ab_testing = self._extract_child_with_type(main_container, 'ABTesting')
+            video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer')
             mgid = video_player['props']['media']['video']['config']['uri']
 
         return mgid
diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py
index 2a7818e41bcc63ae1876f350e28c4e1c14732c88..031454600266de40f0ef8343ba4ce5b9aecc8c4f 100644 (file)
@@ -398,6 +398,16 @@ def parse_quality_items(quality_items):
         formats = []
 
         def add_format(format_url, height=None):
+            ext = determine_ext(format_url)
+            if ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    format_url, video_id, mpd_id='dash', fatal=False))
+                return
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+                return
             tbr = None
             mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', format_url)
             if mobj:
@@ -417,16 +427,6 @@ def add_format(format_url, height=None):
                     r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None)
                 if upload_date:
                     upload_date = upload_date.replace('/', '')
-            ext = determine_ext(video_url)
-            if ext == 'mpd':
-                formats.extend(self._extract_mpd_formats(
-                    video_url, video_id, mpd_id='dash', fatal=False))
-                continue
-            elif ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    video_url, video_id, 'mp4', entry_protocol='m3u8_native',
-                    m3u8_id='hls', fatal=False))
-                continue
             if '/video/get_media' in video_url:
                 medias = self._download_json(video_url, video_id, fatal=False)
                 if isinstance(medias, list):
diff --git a/yt_dlp/extractor/youku.py b/yt_dlp/extractor/youku.py
index 61d1ab20908bf1a233cf90624170d9ee824ff337..880c896875373bb13fb1ea9a373f87283c7e3a6b 100644 (file)
@@ -154,7 +154,7 @@ def _real_extract(self, url):
         # request basic data
         basic_data_params = {
             'vid': video_id,
-            'ccode': '0590',
+            'ccode': '0532',
             'client_ip': '192.168.1.1',
             'utid': cna,
             'client_ts': time.time() / 1000,
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index 67321e6c777d323c7bca05b623c41758352a16a2..1130d10fa52174ef748717d9b46fc9ec475c35ec 100644 (file)
@@ -1248,6 +1248,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg',
             'only_matching': True,
         },
+        {
+            # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685
+            'url': 'cBvYw8_A0vQ',
+            'info_dict': {
+                'id': 'cBvYw8_A0vQ',
+                'ext': 'mp4',
+                'title': '4K Ueno Okachimachi  Street  Scenes  上野御徒町歩き',
+                'description': 'md5:ea770e474b7cd6722b4c95b833c03630',
+                'upload_date': '20201120',
+                'uploader': 'Walk around Japan',
+                'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
     ]
 
     def __init__(self, *args, **kwargs):
@@ -1817,7 +1834,13 @@ def _real_extract(self, url):
         def get_text(x):
             if not x:
                 return
-            return x.get('simpleText') or ''.join([r['text'] for r in x['runs']])
+            text = x.get('simpleText')
+            if text and isinstance(text, compat_str):
+                return text
+            runs = x.get('runs')
+            if not isinstance(runs, list):
+                return
+            return ''.join([r['text'] for r in runs if isinstance(r.get('text'), compat_str)])
 
         search_meta = (
             lambda x: self._html_search_meta(x, webpage, default=None)) \