jfr.im git - yt-dlp.git/commitdiff
Update to ytdl-commit-cf2dbec
authorpukkandan <redacted>
Fri, 19 Feb 2021 20:44:36 +0000 (02:14 +0530)
committerpukkandan <redacted>
Fri, 19 Feb 2021 21:02:22 +0000 (02:32 +0530)
https://github.com/ytdl-org/youtube-dl/commit/cf2dbec6301177a1fddf72862de05fa912d9869d

Except: [kakao] improve info extraction and detect geo restriction
https://github.com/ytdl-org/youtube-dl/commit/d8085580f63ad3b146a31712ff76cf41d5a4558a

19 files changed:
test/test_youtube_lists.py
youtube_dlc/extractor/ard.py
youtube_dlc/extractor/canvas.py
youtube_dlc/extractor/ccma.py
youtube_dlc/extractor/dplay.py
youtube_dlc/extractor/dreisat.py [new file with mode: 0644]
youtube_dlc/extractor/extractors.py
youtube_dlc/extractor/generic.py
youtube_dlc/extractor/ninegag.py
youtube_dlc/extractor/simplecast.py [new file with mode: 0644]
youtube_dlc/extractor/storyfire.py
youtube_dlc/extractor/videopress.py
youtube_dlc/extractor/viki.py
youtube_dlc/extractor/vimeo.py
youtube_dlc/extractor/xboxclips.py
youtube_dlc/extractor/yandexmusic.py
youtube_dlc/extractor/youtube.py
youtube_dlc/extractor/zhihu.py [new file with mode: 0644]
youtube_dlc/postprocessor/embedthumbnail.py

index a693963ef9076e0a5bd5634b668e3fc7852acdd0..d9b8fa55005f8df66fdbe241338cbe11a8eaf4e9 100644 (file)
@@ -12,6 +12,7 @@
 
 from youtube_dlc.extractor import (
     YoutubePlaylistIE,
+    YoutubeTabIE,
     YoutubeIE,
 )
 
@@ -57,14 +58,22 @@ def test_youtube_toptracks(self):
         entries = result['entries']
         self.assertEqual(len(entries), 100)
 
-    def test_youtube_flat_playlist_titles(self):
+    def test_youtube_flat_playlist_extraction(self):
         dl = FakeYDL()
         dl.params['extract_flat'] = True
-        ie = YoutubePlaylistIE(dl)
-        result = ie.extract('https://www.youtube.com/playlist?list=PL-KKIb8rvtMSrAO9YFbeM6UQrAqoFTUWv')
+        ie = YoutubeTabIE(dl)
+        result = ie.extract('https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc')
         self.assertIsPlaylist(result)
-        for entry in result['entries']:
-            self.assertTrue(entry.get('title'))
+        entries = list(result['entries'])
+        self.assertTrue(len(entries) == 1)
+        video = entries[0]
+        self.assertEqual(video['_type'], 'url_transparent')
+        self.assertEqual(video['ie_key'], 'Youtube')
+        self.assertEqual(video['id'], 'BaW_jenozKc')
+        self.assertEqual(video['url'], 'BaW_jenozKc')
+        self.assertEqual(video['title'], 'youtube-dl test video "\'/\\ä↭𝕐')
+        self.assertEqual(video['duration'], 10)
+        self.assertEqual(video['uploader'], 'Philipp Hagemeister')
 
 
 if __name__ == '__main__':
index 73379314523bb0dfaf2f7f828236a13f7b1824f8..12a7cfb54a1fc891ce0e2d8565ddca8d511fbb31 100644 (file)
@@ -324,20 +324,42 @@ def _real_extract(self, url):
 
         formats = []
         for a in video_node.findall('.//asset'):
+            file_name = xpath_text(a, './fileName', default=None)
+            if not file_name:
+                continue
+            format_type = a.attrib.get('type')
+            format_url = url_or_none(file_name)
+            if format_url:
+                ext = determine_ext(file_name)
+                if ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        format_url, display_id, 'mp4', entry_protocol='m3u8_native',
+                        m3u8_id=format_type or 'hls', fatal=False))
+                    continue
+                elif ext == 'f4m':
+                    formats.extend(self._extract_f4m_formats(
+                        update_url_query(format_url, {'hdcore': '3.7.0'}),
+                        display_id, f4m_id=format_type or 'hds', fatal=False))
+                    continue
             f = {
-                'format_id': a.attrib['type'],
-                'width': int_or_none(a.find('./frameWidth').text),
-                'height': int_or_none(a.find('./frameHeight').text),
-                'vbr': int_or_none(a.find('./bitrateVideo').text),
-                'abr': int_or_none(a.find('./bitrateAudio').text),
-                'vcodec': a.find('./codecVideo').text,
-                'tbr': int_or_none(a.find('./totalBitrate').text),
+                'format_id': format_type,
+                'width': int_or_none(xpath_text(a, './frameWidth')),
+                'height': int_or_none(xpath_text(a, './frameHeight')),
+                'vbr': int_or_none(xpath_text(a, './bitrateVideo')),
+                'abr': int_or_none(xpath_text(a, './bitrateAudio')),
+                'vcodec': xpath_text(a, './codecVideo'),
+                'tbr': int_or_none(xpath_text(a, './totalBitrate')),
             }
-            if a.find('./serverPrefix').text:
-                f['url'] = a.find('./serverPrefix').text
-                f['playpath'] = a.find('./fileName').text
+            server_prefix = xpath_text(a, './serverPrefix', default=None)
+            if server_prefix:
+                f.update({
+                    'url': server_prefix,
+                    'playpath': file_name,
+                })
             else:
-                f['url'] = a.find('./fileName').text
+                if not format_url:
+                    continue
+                f['url'] = format_url
             formats.append(f)
         self._sort_formats(formats)
 
index 8b76a0200ca51a350052d7bcc2b4beff3fee2448..eefbab241b9df656265fac8ba4206833f9cf8047 100644 (file)
@@ -7,19 +7,21 @@
 from .gigya import GigyaBaseIE
 from ..compat import compat_HTTPError
 from ..utils import (
-    extract_attributes,
     ExtractorError,
-    strip_or_none,
+    clean_html,
+    extract_attributes,
     float_or_none,
+    get_element_by_class,
     int_or_none,
     merge_dicts,
     str_or_none,
+    strip_or_none,
     url_or_none,
 )
 
 
 class CanvasIE(InfoExtractor):
-    _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)'
     _TESTS = [{
         'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
         'md5': '68993eda72ef62386a15ea2cf3c93107',
@@ -332,3 +334,51 @@ def _real_extract(self, url):
             'display_id': display_id,
             'season_number': int_or_none(page.get('episode_season')),
         })
+
+
+class DagelijkseKostIE(InfoExtractor):
+    IE_DESC = 'dagelijksekost.een.be'
+    _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof',
+        'md5': '30bfffc323009a3e5f689bef6efa2365',
+        'info_dict': {
+            'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa',
+            'display_id': 'hachis-parmentier-met-witloof',
+            'ext': 'mp4',
+            'title': 'Hachis parmentier met witloof',
+            'description': 'md5:9960478392d87f63567b5b117688cdc5',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 283.02,
+        },
+        'expected_warnings': ['is not a supported codec'],
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        title = strip_or_none(get_element_by_class(
+            'dish-metadata__title', webpage
+        ) or self._html_search_meta(
+            'twitter:title', webpage))
+
+        description = clean_html(get_element_by_class(
+            'dish-description', webpage)
+        ) or self._html_search_meta(
+            ('description', 'twitter:description', 'og:description'),
+            webpage)
+
+        video_id = self._html_search_regex(
+            r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
+            group='id')
+
+        return {
+            '_type': 'url_transparent',
+            'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id,
+            'ie_key': CanvasIE.ie_key(),
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+        }
index 4db51e650680812573e7c3179fb7a1f046764c2a..e6ae49352132f81b1afbced0d71fe35c668c29f0 100644 (file)
@@ -1,12 +1,14 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import calendar
 import datetime
 import re
 
 from .common import InfoExtractor
 from ..utils import (
     clean_html,
+    extract_timezone,
     int_or_none,
     parse_duration,
     parse_resolution,
@@ -97,8 +99,9 @@ def _real_extract(self, url):
         timestamp = None
         data_utc = try_get(informacio, lambda x: x['data_emissio']['utc'])
         try:
-            timestamp = datetime.datetime.strptime(
-                data_utc, '%Y-%d-%mT%H:%M:%S%z').timestamp()
+            timezone, data_utc = extract_timezone(data_utc)
+            timestamp = calendar.timegm((datetime.datetime.strptime(
+                data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple())
         except TypeError:
             pass
 
index 47501dbe6140ea8f30d0ce532bfc3bd321f35557..0f0632f26942ead676f59803fe1d4cf37161357f 100644 (file)
@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import json
 import re
 
 from .common import InfoExtractor
     ExtractorError,
     float_or_none,
     int_or_none,
+    strip_or_none,
     unified_timestamp,
 )
 
 
 class DPlayIE(InfoExtractor):
+    _PATH_REGEX = r'/(?P<id>[^/]+/[^/?#]+)'
     _VALID_URL = r'''(?x)https?://
         (?P<domain>
             (?:www\.)?(?P<host>d
@@ -24,7 +27,7 @@ class DPlayIE(InfoExtractor):
                 )
             )|
             (?P<subdomain_country>es|it)\.dplay\.com
-        )/[^/]+/(?P<id>[^/]+/[^/?#]+)'''
+        )/[^/]+''' + _PATH_REGEX
 
     _TESTS = [{
         # non geo restricted, via secure api, unsigned download hls URL
@@ -151,56 +154,79 @@ class DPlayIE(InfoExtractor):
         'only_matching': True,
     }]
 
+    def _process_errors(self, e, geo_countries):
+        info = self._parse_json(e.cause.read().decode('utf-8'), None)
+        error = info['errors'][0]
+        error_code = error.get('code')
+        if error_code == 'access.denied.geoblocked':
+            self.raise_geo_restricted(countries=geo_countries)
+        elif error_code in ('access.denied.missingpackage', 'invalid.token'):
+            raise ExtractorError(
+                'This video is only available for registered users. You may want to use --cookies.', expected=True)
+        raise ExtractorError(info['errors'][0]['detail'], expected=True)
+
+    def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
+        headers['Authorization'] = 'Bearer ' + self._download_json(
+            disco_base + 'token', display_id, 'Downloading token',
+            query={
+                'realm': realm,
+            })['data']['attributes']['token']
+
+    def _download_video_playback_info(self, disco_base, video_id, headers):
+        streaming = self._download_json(
+            disco_base + 'playback/videoPlaybackInfo/' + video_id,
+            video_id, headers=headers)['data']['attributes']['streaming']
+        streaming_list = []
+        for format_id, format_dict in streaming.items():
+            streaming_list.append({
+                'type': format_id,
+                'url': format_dict.get('url'),
+            })
+        return streaming_list
+
     def _get_disco_api_info(self, url, display_id, disco_host, realm, country):
         geo_countries = [country.upper()]
         self._initialize_geo_bypass({
             'countries': geo_countries,
         })
         disco_base = 'https://%s/' % disco_host
-        token = self._download_json(
-            disco_base + 'token', display_id, 'Downloading token',
-            query={
-                'realm': realm,
-            })['data']['attributes']['token']
         headers = {
             'Referer': url,
-            'Authorization': 'Bearer ' + token,
         }
-        video = self._download_json(
-            disco_base + 'content/videos/' + display_id, display_id,
-            headers=headers, query={
-                'fields[channel]': 'name',
-                'fields[image]': 'height,src,width',
-                'fields[show]': 'name',
-                'fields[tag]': 'name',
-                'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration',
-                'include': 'images,primaryChannel,show,tags'
-            })
+        self._update_disco_api_headers(headers, disco_base, display_id, realm)
+        try:
+            video = self._download_json(
+                disco_base + 'content/videos/' + display_id, display_id,
+                headers=headers, query={
+                    'fields[channel]': 'name',
+                    'fields[image]': 'height,src,width',
+                    'fields[show]': 'name',
+                    'fields[tag]': 'name',
+                    'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration',
+                    'include': 'images,primaryChannel,show,tags'
+                })
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+                self._process_errors(e, geo_countries)
+            raise
         video_id = video['data']['id']
         info = video['data']['attributes']
         title = info['name'].strip()
         formats = []
         try:
-            streaming = self._download_json(
-                disco_base + 'playback/videoPlaybackInfo/' + video_id,
-                display_id, headers=headers)['data']['attributes']['streaming']
+            streaming = self._download_video_playback_info(
+                disco_base, video_id, headers)
         except ExtractorError as e:
             if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
-                info = self._parse_json(e.cause.read().decode('utf-8'), display_id)
-                error = info['errors'][0]
-                error_code = error.get('code')
-                if error_code == 'access.denied.geoblocked':
-                    self.raise_geo_restricted(countries=geo_countries)
-                elif error_code == 'access.denied.missingpackage':
-                    self.raise_login_required()
-                raise ExtractorError(info['errors'][0]['detail'], expected=True)
+                self._process_errors(e, geo_countries)
             raise
-        for format_id, format_dict in streaming.items():
+        for format_dict in streaming:
             if not isinstance(format_dict, dict):
                 continue
             format_url = format_dict.get('url')
             if not format_url:
                 continue
+            format_id = format_dict.get('type')
             ext = determine_ext(format_url)
             if format_id == 'dash' or ext == 'mpd':
                 formats.extend(self._extract_mpd_formats(
@@ -248,7 +274,7 @@ def _get_disco_api_info(self, url, display_id, disco_host, realm, country):
             'id': video_id,
             'display_id': display_id,
             'title': title,
-            'description': info.get('description'),
+            'description': strip_or_none(info.get('description')),
             'duration': float_or_none(info.get('videoDuration'), 1000),
             'timestamp': unified_timestamp(info.get('publishStart')),
             'series': series,
@@ -268,3 +294,75 @@ def _real_extract(self, url):
         host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com'
         return self._get_disco_api_info(
             url, display_id, host, 'dplay' + country, country)
+
+
+class DiscoveryPlusIE(DPlayIE):
+    _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video' + DPlayIE._PATH_REGEX
+    _TESTS = [{
+        'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family',
+        'info_dict': {
+            'id': '1140794',
+            'display_id': 'property-brothers-forever-home/food-and-family',
+            'ext': 'mp4',
+            'title': 'Food and Family',
+            'description': 'The brothers help a Richmond family expand their single-level home.',
+            'duration': 2583.113,
+            'timestamp': 1609304400,
+            'upload_date': '20201230',
+            'creator': 'HGTV',
+            'series': 'Property Brothers: Forever Home',
+            'season_number': 1,
+            'episode_number': 1,
+        },
+        'skip': 'Available for Premium users',
+    }]
+
+    def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
+        headers['x-disco-client'] = 'WEB:UNKNOWN:dplus_us:15.0.0'
+
+    def _download_video_playback_info(self, disco_base, video_id, headers):
+        return self._download_json(
+            disco_base + 'playback/v3/videoPlaybackInfo',
+            video_id, headers=headers, data=json.dumps({
+                'deviceInfo': {
+                    'adBlocker': False,
+                },
+                'videoId': video_id,
+                'wisteriaProperties': {
+                    'platform': 'desktop',
+                },
+            }).encode('utf-8'))['data']['attributes']['streaming']
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        return self._get_disco_api_info(
+            url, display_id, 'us1-prod-direct.discoveryplus.com', 'go', 'us')
+
+
+class HGTVDeIE(DPlayIE):
+    _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayIE._PATH_REGEX
+    _TESTS = [{
+        'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/',
+        'info_dict': {
+            'id': '151205',
+            'display_id': 'tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette',
+            'ext': 'mp4',
+            'title': 'Wer braucht schon eine Toilette',
+            'description': 'md5:05b40a27e7aed2c9172de34d459134e2',
+            'duration': 1177.024,
+            'timestamp': 1595705400,
+            'upload_date': '20200725',
+            'creator': 'HGTV',
+            'series': 'Tiny House - klein, aber oho',
+            'season_number': 3,
+            'episode_number': 3,
+        },
+        'params': {
+            'format': 'bestvideo',
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        return self._get_disco_api_info(
+            url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de')
diff --git a/youtube_dlc/extractor/dreisat.py b/youtube_dlc/extractor/dreisat.py
new file mode 100644 (file)
index 0000000..848d387
--- /dev/null
@@ -0,0 +1,193 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unified_strdate,
+    xpath_text,
+    determine_ext,
+    float_or_none,
+    ExtractorError,
+)
+
+
+class DreiSatIE(InfoExtractor):
+    IE_NAME = '3sat'
+    _GEO_COUNTRIES = ['DE']
+    _VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)'
+    _TESTS = [
+        {
+            'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918',
+            'md5': 'be37228896d30a88f315b638900a026e',
+            'info_dict': {
+                'id': '45918',
+                'ext': 'mp4',
+                'title': 'Waidmannsheil',
+                'description': 'md5:cce00ca1d70e21425e72c86a98a56817',
+                'uploader': 'SCHWEIZWEIT',
+                'uploader_id': '100000210',
+                'upload_date': '20140913'
+            },
+            'params': {
+                'skip_download': True,  # m3u8 downloads
+            }
+        },
+        {
+            'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066',
+            'only_matching': True,
+        },
+    ]
+
+    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+        param_groups = {}
+        for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)):
+            group_id = param_group.get(self._xpath_ns(
+                'id', 'http://www.w3.org/XML/1998/namespace'))
+            params = {}
+            for param in param_group:
+                params[param.get('name')] = param.get('value')
+            param_groups[group_id] = params
+
+        formats = []
+        for video in smil.findall(self._xpath_ns('.//video', namespace)):
+            src = video.get('src')
+            if not src:
+                continue
+            bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+            group_id = video.get('paramGroup')
+            param_group = param_groups[group_id]
+            for proto in param_group['protocols'].split(','):
+                formats.append({
+                    'url': '%s://%s' % (proto, param_group['host']),
+                    'app': param_group['app'],
+                    'play_path': src,
+                    'ext': 'flv',
+                    'format_id': '%s-%d' % (proto, bitrate),
+                    'tbr': bitrate,
+                })
+        self._sort_formats(formats)
+        return formats
+
+    def extract_from_xml_url(self, video_id, xml_url):
+        doc = self._download_xml(
+            xml_url, video_id,
+            note='Downloading video info',
+            errnote='Failed to download video info')
+
+        status_code = xpath_text(doc, './status/statuscode')
+        if status_code and status_code != 'ok':
+            if status_code == 'notVisibleAnymore':
+                message = 'Video %s is not available' % video_id
+            else:
+                message = '%s returned error: %s' % (self.IE_NAME, status_code)
+            raise ExtractorError(message, expected=True)
+
+        title = xpath_text(doc, './/information/title', 'title', True)
+
+        urls = []
+        formats = []
+        for fnode in doc.findall('.//formitaeten/formitaet'):
+            video_url = xpath_text(fnode, 'url')
+            if not video_url or video_url in urls:
+                continue
+            urls.append(video_url)
+
+            is_available = 'http://www.metafilegenerator' not in video_url
+            geoloced = 'static_geoloced_online' in video_url
+            if not is_available or geoloced:
+                continue
+
+            format_id = fnode.attrib['basetype']
+            format_m = re.match(r'''(?x)
+                (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_
+                (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+)
+            ''', format_id)
+
+            ext = determine_ext(video_url, None) or format_m.group('container')
+
+            if ext == 'meta':
+                continue
+            elif ext == 'smil':
+                formats.extend(self._extract_smil_formats(
+                    video_url, video_id, fatal=False))
+            elif ext == 'm3u8':
+                # the certificates are misconfigured (see
+                # https://github.com/ytdl-org/youtube-dl/issues/8665)
+                if video_url.startswith('https://'):
+                    continue
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id=format_id, fatal=False))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    video_url, video_id, f4m_id=format_id, fatal=False))
+            else:
+                quality = xpath_text(fnode, './quality')
+                if quality:
+                    format_id += '-' + quality
+
+                abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000)
+                vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000)
+
+                tbr = int_or_none(self._search_regex(
+                    r'_(\d+)k', video_url, 'bitrate', None))
+                if tbr and vbr and not abr:
+                    abr = tbr - vbr
+
+                formats.append({
+                    'format_id': format_id,
+                    'url': video_url,
+                    'ext': ext,
+                    'acodec': format_m.group('acodec'),
+                    'vcodec': format_m.group('vcodec'),
+                    'abr': abr,
+                    'vbr': vbr,
+                    'tbr': tbr,
+                    'width': int_or_none(xpath_text(fnode, './width')),
+                    'height': int_or_none(xpath_text(fnode, './height')),
+                    'filesize': int_or_none(xpath_text(fnode, './filesize')),
+                    'protocol': format_m.group('proto').lower(),
+                })
+
+        geolocation = xpath_text(doc, './/details/geolocation')
+        if not formats and geolocation and geolocation != 'none':
+            self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+
+        self._sort_formats(formats)
+
+        thumbnails = []
+        for node in doc.findall('.//teaserimages/teaserimage'):
+            thumbnail_url = node.text
+            if not thumbnail_url:
+                continue
+            thumbnail = {
+                'url': thumbnail_url,
+            }
+            thumbnail_key = node.get('key')
+            if thumbnail_key:
+                m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key)
+                if m:
+                    thumbnail['width'] = int(m.group(1))
+                    thumbnail['height'] = int(m.group(2))
+            thumbnails.append(thumbnail)
+
+        upload_date = unified_strdate(xpath_text(doc, './/details/airtime'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': xpath_text(doc, './/information/detail'),
+            'duration': int_or_none(xpath_text(doc, './/details/lengthSec')),
+            'thumbnails': thumbnails,
+            'uploader': xpath_text(doc, './/details/originChannelTitle'),
+            'uploader_id': xpath_text(doc, './/details/originChannelId'),
+            'upload_date': upload_date,
+            'formats': formats,
+        }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id
+        return self.extract_from_xml_url(video_id, details_url)
index cbbc8f7cd64f480d5a431bd5a9ffab06e8026945..8c6f96bd1c62c335c4a96b86083cde0a4d0b412a 100644 (file)
     CanvasIE,
     CanvasEenIE,
     VrtNUIE,
+    DagelijkseKostIE,
 )
 from .carambatv import (
     CarambaTVIE,
     DouyuShowIE,
     DouyuTVIE,
 )
-from .dplay import DPlayIE
+from .dplay import (
+    DPlayIE,
+    DiscoveryPlusIE,
+    HGTVDeIE,
+)
+from .dreisat import DreiSatIE
 from .drbonanza import DRBonanzaIE
 from .drtuber import DrTuberIE
 from .drtv import (
     VivoIE,
 )
 from .showroomlive import ShowRoomLiveIE
+from .simplecast import (
+    SimplecastIE,
+    SimplecastEpisodeIE,
+    SimplecastPodcastIE,
+)
 from .sina import SinaIE
 from .sixplay import SixPlayIE
 from .skyit import (
     BellatorIE,
     ParamountNetworkIE,
 )
-from .storyfire import (
-    StoryFireIE,
-    StoryFireUserIE,
-    StoryFireSeriesIE,
-)
 from .stitcher import StitcherIE
 from .sport5 import Sport5IE
 from .sportbox import SportBoxIE
 from .srmediathek import SRMediathekIE
 from .stanfordoc import StanfordOpenClassroomIE
 from .steam import SteamIE
+from .storyfire import (
+    StoryFireIE,
+    StoryFireUserIE,
+    StoryFireSeriesIE,
+)
 from .streamable import StreamableIE
 from .streamcloud import StreamcloudIE
 from .streamcz import StreamCZIE
     ZattooLiveIE,
 )
 from .zdf import ZDFIE, ZDFChannelIE
+from .zhihu import ZhihuIE
 from .zingmp3 import ZingMp3IE
 from .zoom import ZoomIE
 from .zype import ZypeIE
index 819ba46a8a0d0d7a2f3a3b0cd24ab22150e92c0c..8cde11d2b8d8ba8ff87e9289d4aa1fec822b81d0 100644 (file)
 from .rumble import RumbleEmbedIE
 from .arcpublishing import ArcPublishingIE
 from .medialaan import MedialaanIE
+from .simplecast import SimplecastIE
 
 
 class GenericIE(InfoExtractor):
@@ -2240,6 +2241,15 @@ class GenericIE(InfoExtractor):
                 'duration': 159,
             },
         },
+        {
+            # Simplecast player embed
+            'url': 'https://www.bio.org/podcast',
+            'info_dict': {
+                'id': 'podcast',
+                'title': 'I AM BIO Podcast | BIO',
+            },
+            'playlist_mincount': 52,
+        },
     ]
 
     def report_following_redirect(self, new_url):
@@ -2794,6 +2804,12 @@ def _real_extract(self, url):
             return self.playlist_from_matches(
                 matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')
 
+        # Look for Simplecast embeds
+        simplecast_urls = SimplecastIE._extract_urls(webpage)
+        if simplecast_urls:
+            return self.playlist_from_matches(
+                simplecast_urls, video_id, video_title)
+
         # Look for BBC iPlayer embed
         matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
         if matches:
index 440f865bcee89c7c70f7bde7e67966af3bd2f44d..14390823bcf8e3d6ecb7738f134f302c25266338 100644 (file)
@@ -2,10 +2,11 @@
 
 from .common import InfoExtractor
 from ..utils import (
-    determine_ext,
     ExtractorError,
+    determine_ext,
     int_or_none,
     try_get,
+    unescapeHTML,
     url_or_none,
 )
 
@@ -14,7 +15,7 @@ class NineGagIE(InfoExtractor):
     IE_NAME = '9gag'
     _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P<id>[^/?&#]+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'https://9gag.com/gag/ae5Ag7B',
         'info_dict': {
             'id': 'ae5Ag7B',
@@ -29,7 +30,11 @@ class NineGagIE(InfoExtractor):
             'dislike_count': int,
             'comment_count': int,
         }
-    }
+    }, {
+        # HTML escaped title
+        'url': 'https://9gag.com/gag/av5nvyb',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         post_id = self._match_id(url)
@@ -43,7 +48,7 @@ def _real_extract(self, url):
                 'The given url does not contain a video',
                 expected=True)
 
-        title = post['title']
+        title = unescapeHTML(post['title'])
 
         duration = None
         formats = []
diff --git a/youtube_dlc/extractor/simplecast.py b/youtube_dlc/extractor/simplecast.py
new file mode 100644 (file)
index 0000000..2d0b3c0
--- /dev/null
@@ -0,0 +1,160 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_podcast_url,
+    int_or_none,
+    parse_iso8601,
+    strip_or_none,
+    try_get,
+    urlencode_postdata,
+)
+
+
class SimplecastBaseIE(InfoExtractor):
    """Base class with helpers shared by the Simplecast extractors."""

    # UUID format used by api.simplecast.com resource IDs
    _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}'
    _API_BASE = 'https://api.simplecast.com/'

    def _call_api(self, path_tmpl, video_id):
        # GET a JSON resource from the Simplecast REST API;
        # path_tmpl carries a single %s placeholder for the resource id.
        return self._download_json(
            self._API_BASE + path_tmpl % video_id, video_id)

    def _call_search_api(self, resource, resource_id, resource_url):
        # Resolve a public website URL to its API object via the
        # /<resource>s/search endpoint (POSTing the URL as form data).
        return self._download_json(
            'https://api.simplecast.com/%ss/search' % resource, resource_id,
            data=urlencode_postdata({'url': resource_url}))

    def _parse_episode(self, episode):
        """Convert an API episode object into an info dict.

        Raises KeyError if the episode has no audio URL at all
        (audio_file.url, audio_file_url and enclosure_url all absent).
        """
        episode_id = episode['id']
        title = episode['title'].strip()
        audio_file = episode.get('audio_file') or {}
        audio_file_url = audio_file.get('url') or episode.get('audio_file_url') or episode['enclosure_url']

        season = episode.get('season') or {}
        season_href = season.get('href')
        season_id = None
        if season_href:
            # FIX: dots in the hostname are escaped so '.' cannot match
            # arbitrary characters when extracting the season UUID.
            season_id = self._search_regex(
                r'https?://api\.simplecast\.com/seasons/(%s)' % self._UUID_REGEX,
                season_href, 'season id', default=None)

        webpage_url = episode.get('episode_url')
        channel_url = None
        if webpage_url:
            # The episode page lives on the podcast's own subdomain.
            channel_url = self._search_regex(
                r'(https?://[^/]+\.simplecast\.com)',
                webpage_url, 'channel url', default=None)

        return {
            'id': episode_id,
            'display_id': episode.get('slug'),
            'title': title,
            'url': clean_podcast_url(audio_file_url),
            'webpage_url': webpage_url,
            'channel_url': channel_url,
            'series': try_get(episode, lambda x: x['podcast']['title']),
            'season_number': int_or_none(season.get('number')),
            'season_id': season_id,
            'thumbnail': episode.get('image_url'),
            'episode_id': episode_id,
            'episode_number': int_or_none(episode.get('number')),
            'description': strip_or_none(episode.get('description')),
            'timestamp': parse_iso8601(episode.get('published_at')),
            'duration': int_or_none(episode.get('duration')),
            'filesize': int_or_none(audio_file.get('size') or episode.get('audio_file_size')),
        }
+
+
class SimplecastIE(SimplecastBaseIE):
    """Extractor for direct API/player episode URLs."""
    IE_NAME = 'simplecast'
    _VALID_URL = r'https?://(?:api\.simplecast\.com/episodes|player\.simplecast\.com)/(?P<id>%s)' % SimplecastBaseIE._UUID_REGEX
    _COMMON_TEST_INFO = {
        'display_id': 'errant-signal-chris-franklin-new-wave-video-essays',
        'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
        'ext': 'mp3',
        'title': 'Errant Signal - Chris Franklin & New Wave Video Essays',
        'episode_number': 1,
        'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
        'description': 'md5:34752789d3d2702e2d2c975fbd14f357',
        'season_number': 1,
        'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13',
        'series': 'The RE:BIND.io Podcast',
        'duration': 5343,
        'timestamp': 1580979475,
        'upload_date': '20200206',
        'webpage_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com/episodes/errant-signal-chris-franklin-new-wave-video-essays',
        'channel_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com$',
    }
    _TESTS = [{
        'url': 'https://api.simplecast.com/episodes/b6dc49a2-9404-4853-9aa9-9cfc097be876',
        'md5': '8c93be7be54251bf29ee97464eabe61c',
        'info_dict': _COMMON_TEST_INFO,
    }, {
        'url': 'https://player.simplecast.com/b6dc49a2-9404-4853-9aa9-9cfc097be876',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        # Collect every embed/player iframe URL found in an HTML page.
        embed_pattern = r'''(?x)<iframe[^>]+src=["\']
                (
                    https?://(?:embed\.simplecast\.com/[0-9a-f]{8}|
                    player\.simplecast\.com/%s
                ))''' % SimplecastBaseIE._UUID_REGEX
        return re.findall(embed_pattern, webpage)

    def _real_extract(self, url):
        uuid = self._match_id(url)
        return self._parse_episode(self._call_api('episodes/%s', uuid))
+
+
class SimplecastEpisodeIE(SimplecastBaseIE):
    """Extractor for episode pages hosted on a podcast's own subdomain."""
    IE_NAME = 'simplecast:episode'
    _VALID_URL = r'https?://(?!api\.)[^/]+\.simplecast\.com/episodes/(?P<id>[^/?&#]+)'
    _TEST = {
        'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes/errant-signal-chris-franklin-new-wave-video-essays',
        'md5': '8c93be7be54251bf29ee97464eabe61c',
        'info_dict': SimplecastIE._COMMON_TEST_INFO,
    }

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        # The page URL itself is resolved to the API episode object.
        episode = self._call_search_api(
            'episode', match.group('id'), match.group(0))
        return self._parse_episode(episode)
+
+
class SimplecastPodcastIE(SimplecastBaseIE):
    """Playlist extractor for a whole podcast (a *.simplecast.com site)."""
    IE_NAME = 'simplecast:podcast'
    _VALID_URL = r'https?://(?!(?:api|cdn|embed|feeds|player)\.)(?P<id>[^/]+)\.simplecast\.com(?!/episodes/[^/?&#]+)'
    _TESTS = [{
        'url': 'https://the-re-bind-io-podcast.simplecast.com',
        'playlist_mincount': 33,
        'info_dict': {
            'id': '07d28d26-7522-42eb-8c53-2bdcfc81c43c',
            'title': 'The RE:BIND.io Podcast',
        },
    }, {
        'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        subdomain = self._match_id(url)
        # Map the site URL to its podcast object via the search endpoint.
        podcast = self._call_search_api('site', subdomain, url)['podcast']
        podcast_id = podcast['id']
        podcast_title = podcast.get('title')

        def entries():
            collection = self._call_api(
                'podcasts/%s/episodes', podcast_id).get('collection') or []
            for episode in collection:
                info = self._parse_episode(episode)
                # Episode objects from this endpoint lack podcast data;
                # fill the series title from the podcast object instead.
                info['series'] = podcast_title
                yield info

        return self.playlist_result(entries(), podcast_id, podcast_title)
index 19cb1ff9e3866931e4ea76c83ba12a17d2478704..9c698626fc68ddd24e6295e8a6c0fded011d40e1 100644 (file)
 # coding: utf-8
 from __future__ import unicode_literals
 
-import itertools
+import functools
+
 from .common import InfoExtractor
+from ..utils import (
+    # HEADRequest,
+    int_or_none,
+    OnDemandPagedList,
+    smuggle_url,
+)
+
+
+class StoryFireBaseIE(InfoExtractor):
+    _VALID_URL_BASE = r'https?://(?:www\.)?storyfire\.com/'
+
+    def _call_api(self, path, video_id, resource, query=None):
+        return self._download_json(
+            'https://storyfire.com/app/%s/%s' % (path, video_id), video_id,
+            'Downloading %s JSON metadata' % resource, query=query)
+
+    def _parse_video(self, video):
+        title = video['title']
+        vimeo_id = self._search_regex(
+            r'https?://player\.vimeo\.com/external/(\d+)',
+            video['vimeoVideoURL'], 'vimeo id')
+
+        # video_url = self._request_webpage(
+        #    HEADRequest(video['vimeoVideoURL']), video_id).geturl()
+        # formats = []
+        # for v_url, suffix in [(video_url, '_sep'), (video_url.replace('/sep/video/', '/video/'), '')]:
+        #    formats.extend(self._extract_m3u8_formats(
+        #        v_url, video_id, 'mp4', 'm3u8_native',
+        #        m3u8_id='hls' + suffix, fatal=False))
+        #    formats.extend(self._extract_mpd_formats(
+        #        v_url.replace('.m3u8', '.mpd'), video_id,
+        #        mpd_id='dash' + suffix, fatal=False))
+        # self._sort_formats(formats)
 
+        uploader_id = video.get('hostID')
 
-class StoryFireIE(InfoExtractor):
-    _VALID_URL = r'(?:(?:https?://(?:www\.)?storyfire\.com/video-details)|(?:https://storyfire.app.link))/(?P<id>[^/\s]+)'
-    _TESTS = [{
+        return {
+            '_type': 'url_transparent',
+            'id': vimeo_id,
+            'title': title,
+            'description': video.get('description'),
+            'url': smuggle_url(
+                'https://player.vimeo.com/video/' + vimeo_id, {
+                    'http_headers': {
+                        'Referer': 'https://storyfire.com/',
+                    }
+                }),
+            # 'formats': formats,
+            'thumbnail': video.get('storyImage'),
+            'view_count': int_or_none(video.get('views')),
+            'like_count': int_or_none(video.get('likesCount')),
+            'comment_count': int_or_none(video.get('commentsCount')),
+            'duration': int_or_none(video.get('videoDuration')),
+            'timestamp': int_or_none(video.get('publishDate')),
+            'uploader': video.get('username'),
+            'uploader_id': uploader_id,
+            'uploader_url': 'https://storyfire.com/user/%s/video' % uploader_id if uploader_id else None,
+            'episode_number': int_or_none(video.get('episodeNumber') or video.get('episode_number')),
+        }
+
+
+class StoryFireIE(StoryFireBaseIE):
+    _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'video-details/(?P<id>[0-9a-f]{24})'
+    _TEST = {
         'url': 'https://storyfire.com/video-details/5df1d132b6378700117f9181',
-        'md5': '560953bfca81a69003cfa5e53ac8a920',
+        'md5': 'caec54b9e4621186d6079c7ec100c1eb',
         'info_dict': {
-            'id': '5df1d132b6378700117f9181',
+            'id': '378954662',
             'ext': 'mp4',
             'title': 'Buzzfeed Teaches You About Memes',
             'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1',
             'timestamp': 1576129028,
-            'description': 'Mocking Buzzfeed\'s meme lesson. Reuploaded from YouTube because of their new policies',
+            'description': 'md5:0b4e28021548e144bed69bb7539e62ea',
             'uploader': 'whang!',
             'upload_date': '20191212',
+            'duration': 418,
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
         },
-        'params': {'format': 'bestvideo'}  # There are no merged formats in the playlist.
-    }, {
-        'url': 'https://storyfire.app.link/5GxAvWOQr8',  # Alternate URL format, with unrelated short ID
-        'md5': '7a2dc6d60c4889edfed459c620fe690d',
-        'info_dict': {
-            'id': '5f1e11ecd78a57b6c702001d',
-            'ext': 'm4a',
-            'title': 'Weird Nintendo Prototype Leaks',
-            'description': 'A stream taking a look at some weird Nintendo Prototypes with Luigi in Mario 64 and weird Yoshis',
-            'timestamp': 1595808576,
-            'upload_date': '20200727',
-            'uploader': 'whang!',
-            'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1',
+        'params': {
+            'skip_download': True,
         },
-        'params': {'format': 'bestaudio'}  # Verifying audio extraction
-
-    }]
-
-    _aformats = {
-        'audio-medium-audio': {'acodec': 'aac', 'abr': 125, 'preference': -10},
-        'audio-high-audio': {'acodec': 'aac', 'abr': 254, 'preference': -1},
+        'expected_warnings': ['Unable to download JSON metadata']
     }
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        # Extracting the json blob is mandatory to proceed with extraction.
-        jsontext = self._html_search_regex(
-            r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>',
-            webpage, 'json_data')
-
-        json = self._parse_json(jsontext, video_id)
-
-        # The currentVideo field in the json is mandatory
-        # because it contains the only link to the m3u playlist
-        video = json['props']['initialState']['video']['currentVideo']
-        videourl = video['vimeoVideoURL']  # Video URL is mandatory
-
-        # Extract other fields from the json in an error tolerant fashion
-        # ID may be incorrect (on short URL format), correct it.
-        parsed_id = video.get('_id')
-        if parsed_id:
-            video_id = parsed_id
+        video = self._call_api(
+            'generic/video-detail', video_id, 'video')['video']
+        return self._parse_video(video)
 
-        title = video.get('title')
-        description = video.get('description')
 
-        thumbnail = video.get('storyImage')
-        views = video.get('views')
-        likes = video.get('likesCount')
-        comments = video.get('commentsCount')
-        duration = video.get('videoDuration')
-        publishdate = video.get('publishDate')  # Apparently epoch time, day only
-
-        uploader = video.get('username')
-        uploader_id = video.get('hostID')
-        # Construct an uploader URL
-        uploader_url = None
-        if uploader_id:
-            uploader_url = "https://storyfire.com/user/%s/video" % uploader_id
-
-        # Collect root playlist to determine formats
-        formats = self._extract_m3u8_formats(
-            videourl, video_id, 'mp4', 'm3u8_native')
-
-        # Modify formats to fill in missing information about audio codecs
-        for format in formats:
-            aformat = self._aformats.get(format['format_id'])
-            if aformat:
-                format['acodec'] = aformat['acodec']
-                format['abr'] = aformat['abr']
-                format['quality'] = aformat['preference']
-                format['ext'] = 'm4a'
-
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'ext': "mp4",
-            'url': videourl,
-            'formats': formats,
-
-            'thumbnail': thumbnail,
-            'view_count': views,
-            'like_count': likes,
-            'comment_count': comments,
-            'duration': duration,
-            'timestamp': publishdate,
-
-            'uploader': uploader,
-            'uploader_id': uploader_id,
-            'uploader_url': uploader_url,
-
-        }
-
-
-class StoryFireUserIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?storyfire\.com/user/(?P<id>[^/\s]+)/video'
-    _TESTS = [{
-        'url': 'https://storyfire.com/user/ntZAJFECERSgqHSxzonV5K2E89s1/video',
-        'info_dict': {
-            'id': 'ntZAJFECERSgqHSxzonV5K2E89s1',
-            'title': 'whang!',
-        },
-        'playlist_mincount': 18
-    }, {
+class StoryFireUserIE(StoryFireBaseIE):
+    _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'user/(?P<id>[^/]+)/video'
+    _TEST = {
         'url': 'https://storyfire.com/user/UQ986nFxmAWIgnkZQ0ftVhq4nOk2/video',
         'info_dict': {
             'id': 'UQ986nFxmAWIgnkZQ0ftVhq4nOk2',
-            'title': 'McJuggerNuggets',
         },
-        'playlist_mincount': 143
-
-    }]
+        'playlist_mincount': 151,
+    }
+    _PAGE_SIZE = 20
 
-    # Generator for fetching playlist items
-    def _enum_videos(self, baseurl, user_id, firstjson):
-        totalVideos = int(firstjson['videosCount'])
-        haveVideos = 0
-        json = firstjson
-
-        for page in itertools.count(1):
-            for video in json['videos']:
-                id = video['_id']
-                url = "https://storyfire.com/video-details/%s" % id
-                haveVideos += 1
-                yield {
-                    '_type': 'url',
-                    'id': id,
-                    'url': url,
-                    'ie_key': 'StoryFire',
-
-                    'title': video.get('title'),
-                    'description': video.get('description'),
-                    'view_count': video.get('views'),
-                    'comment_count': video.get('commentsCount'),
-                    'duration': video.get('videoDuration'),
-                    'timestamp': video.get('publishDate'),
-                }
-            # Are there more pages we could fetch?
-            if haveVideos < totalVideos:
-                pageurl = baseurl + ("%i" % haveVideos)
-                json = self._download_json(pageurl, user_id,
-                                           note='Downloading page %s' % page)
-
-                # Are there any videos in the new json?
-                videos = json.get('videos')
-                if not videos or len(videos) == 0:
-                    break  # no videos
-
-            else:
-                break  # We have fetched all the videos, stop
+    def _fetch_page(self, user_id, page):
+        videos = self._call_api(
+            'publicVideos', user_id, 'page %d' % (page + 1), {
+                'skip': page * self._PAGE_SIZE,
+            })['videos']
+        for video in videos:
+            yield self._parse_video(video)
 
     def _real_extract(self, url):
         user_id = self._match_id(url)
+        entries = OnDemandPagedList(functools.partial(
+            self._fetch_page, user_id), self._PAGE_SIZE)
+        return self.playlist_result(entries, user_id)
 
-        baseurl = "https://storyfire.com/app/publicVideos/%s?skip=" % user_id
-
-        # Download first page to ensure it can be downloaded, and get user information if available.
-        firstpage = baseurl + "0"
-        firstjson = self._download_json(firstpage, user_id)
-
-        title = None
-        videos = firstjson.get('videos')
-        if videos and len(videos):
-            title = videos[1].get('username')
 
-        return {
-            '_type': 'playlist',
-            'entries': self._enum_videos(baseurl, user_id, firstjson),
-            'id': user_id,
-            'title': title,
-        }
-
-
-class StoryFireSeriesIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?storyfire\.com/write/series/stories/(?P<id>[^/\s]+)'
+class StoryFireSeriesIE(StoryFireBaseIE):
+    _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'write/series/stories/(?P<id>[^/?&#]+)'
     _TESTS = [{
         'url': 'https://storyfire.com/write/series/stories/-Lq6MsuIHLODO6d2dDkr/',
         'info_dict': {
             'id': '-Lq6MsuIHLODO6d2dDkr',
         },
-        'playlist_mincount': 13
+        'playlist_mincount': 13,
     }, {
         'url': 'https://storyfire.com/write/series/stories/the_mortal_one/',
         'info_dict': {
             'id': 'the_mortal_one',
         },
-        'playlist_count': 0  # This playlist has entries, but no videos.
-    }, {
-        'url': 'https://storyfire.com/write/series/stories/story_time',
-        'info_dict': {
-            'id': 'story_time',
-        },
-        'playlist_mincount': 10
+        'playlist_count': 0,
     }]
 
-    # Generator for returning playlist items
-    # This object is substantially different than the one in the user videos page above
-    def _enum_videos(self, jsonlist):
-        for video in jsonlist:
-            id = video['_id']
-            if video.get('hasVideo'):  # Boolean element
-                url = "https://storyfire.com/video-details/%s" % id
-                yield {
-                    '_type': 'url',
-                    'id': id,
-                    'url': url,
-                    'ie_key': 'StoryFire',
-
-                    'title': video.get('title'),
-                    'description': video.get('description'),
-                    'view_count': video.get('views'),
-                    'likes_count': video.get('likesCount'),
-                    'comment_count': video.get('commentsCount'),
-                    'duration': video.get('videoDuration'),
-                    'timestamp': video.get('publishDate'),
-                }
+    def _extract_videos(self, stories):
+        for story in stories.values():
+            if story.get('hasVideo'):
+                yield self._parse_video(story)
 
     def _real_extract(self, url):
-        list_id = self._match_id(url)
-
-        listurl = "https://storyfire.com/app/seriesStories/%s/list" % list_id
-        json = self._download_json(listurl, list_id)
-
-        return {
-            '_type': 'playlist',
-            'entries': self._enum_videos(json),
-            'id': list_id
-        }
+        series_id = self._match_id(url)
+        stories = self._call_api(
+            'seriesStories', series_id, 'series stories')
+        return self.playlist_result(self._extract_videos(stories), series_id)
index e5f964d39abaa06ea0400bb68214a5ed7cab42f2..6376ff09613f354690cac14759734c1c14fd4f06 100644 (file)
@@ -4,21 +4,22 @@
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     determine_ext,
     float_or_none,
+    int_or_none,
     parse_age_limit,
     qualities,
     random_birthday,
-    try_get,
     unified_timestamp,
     urljoin,
 )
 
 
 class VideoPressIE(InfoExtractor):
-    _VALID_URL = r'https?://videopress\.com/embed/(?P<id>[\da-zA-Z]+)'
+    _ID_REGEX = r'[\da-zA-Z]{8}'
+    _PATH_REGEX = r'video(?:\.word)?press\.com/embed/'
+    _VALID_URL = r'https?://%s(?P<id>%s)' % (_PATH_REGEX, _ID_REGEX)
     _TESTS = [{
         'url': 'https://videopress.com/embed/kUJmAcSf',
         'md5': '706956a6c875873d51010921310e4bc6',
@@ -36,35 +37,36 @@ class VideoPressIE(InfoExtractor):
         # 17+, requires birth_* params
         'url': 'https://videopress.com/embed/iH3gstfZ',
         'only_matching': True,
+    }, {
+        'url': 'https://video.wordpress.com/embed/kUJmAcSf',
+        'only_matching': True,
     }]
 
     @staticmethod
     def _extract_urls(webpage):
         return re.findall(
-            r'<iframe[^>]+src=["\']((?:https?://)?videopress\.com/embed/[\da-zA-Z]+)',
+            r'<iframe[^>]+src=["\']((?:https?://)?%s%s)' % (VideoPressIE._PATH_REGEX, VideoPressIE._ID_REGEX),
             webpage)
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
         query = random_birthday('birth_year', 'birth_month', 'birth_day')
+        query['fields'] = 'description,duration,file_url_base,files,height,original,poster,rating,title,upload_date,width'
         video = self._download_json(
             'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id,
             video_id, query=query)
 
         title = video['title']
 
-        def base_url(scheme):
-            return try_get(
-                video, lambda x: x['file_url_base'][scheme], compat_str)
-
-        base_url = base_url('https') or base_url('http')
+        file_url_base = video.get('file_url_base') or {}
+        base_url = file_url_base.get('https') or file_url_base.get('http')
 
         QUALITIES = ('std', 'dvd', 'hd')
         quality = qualities(QUALITIES)
 
         formats = []
-        for format_id, f in video['files'].items():
+        for format_id, f in (video.get('files') or {}).items():
             if not isinstance(f, dict):
                 continue
             for ext, path in f.items():
@@ -75,12 +77,14 @@ def base_url(scheme):
                         'ext': determine_ext(path, ext),
                         'quality': quality(format_id),
                     })
-        original_url = try_get(video, lambda x: x['original'], compat_str)
+        original_url = video.get('original')
         if original_url:
             formats.append({
                 'url': original_url,
                 'format_id': 'original',
                 'quality': len(QUALITIES),
+                'width': int_or_none(video.get('width')),
+                'height': int_or_none(video.get('height')),
             })
         self._sort_formats(formats)
 
index 48d244cd6fa2686501fa3ee4b148afea14f1d68e..d9731095ceb9f9b36cf8061c68a9736dcb16a61e 100644 (file)
@@ -22,6 +22,7 @@
     parse_iso8601,
     sanitized_Request,
     std_headers,
+    try_get,
 )
 
 
@@ -42,7 +43,7 @@ class VikiBaseIE(InfoExtractor):
     _ERRORS = {
         'geo': 'Sorry, this content is not available in your region.',
         'upcoming': 'Sorry, this content is not yet available.',
-        # 'paywall': 'paywall',
+        'paywall': 'Sorry, this content is only available to Viki Pass Plus subscribers',
     }
 
     def _prepare_call(self, path, timestamp=None, post_data=None):
@@ -94,11 +95,13 @@ def _raise_error(self, error):
             expected=True)
 
     def _check_errors(self, data):
-        for reason, status in data.get('blocking', {}).items():
+        for reason, status in (data.get('blocking') or {}).items():
             if status and reason in self._ERRORS:
                 message = self._ERRORS[reason]
                 if reason == 'geo':
                     self.raise_geo_restricted(msg=message)
+                elif reason == 'paywall':
+                    self.raise_login_required(message)
                 raise ExtractorError('%s said: %s' % (
                     self.IE_NAME, message), expected=True)
 
@@ -143,13 +146,19 @@ class VikiIE(VikiBaseIE):
         'info_dict': {
             'id': '1023585v',
             'ext': 'mp4',
-            'title': 'Heirs Episode 14',
-            'uploader': 'SBS',
-            'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e',
+            'title': 'Heirs Episode 14',
+            'uploader': 'SBS Contents Hub',
+            'timestamp': 1385047627,
             'upload_date': '20131121',
             'age_limit': 13,
+            'duration': 3570,
+            'episode_number': 14,
+        },
+        'params': {
+            'format': 'bestvideo',
         },
         'skip': 'Blocked in the US',
+        'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
     }, {
         # clip
         'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
@@ -165,7 +174,8 @@ class VikiIE(VikiBaseIE):
             'uploader': 'Arirang TV',
             'like_count': int,
             'age_limit': 0,
-        }
+        },
+        'skip': 'Sorry. There was an error loading this video',
     }, {
         'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi',
         'info_dict': {
@@ -183,7 +193,7 @@ class VikiIE(VikiBaseIE):
     }, {
         # episode
         'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
-        'md5': '94e0e34fd58f169f40c184f232356cfe',
+        'md5': '0a53dc252e6e690feccd756861495a8c',
         'info_dict': {
             'id': '44699v',
             'ext': 'mp4',
@@ -195,6 +205,10 @@ class VikiIE(VikiBaseIE):
             'uploader': 'group8',
             'like_count': int,
             'age_limit': 13,
+            'episode_number': 1,
+        },
+        'params': {
+            'format': 'bestvideo',
         },
         'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
     }, {
@@ -221,7 +235,7 @@ class VikiIE(VikiBaseIE):
     }, {
         # non-English description
         'url': 'http://www.viki.com/videos/158036v-love-in-magic',
-        'md5': 'adf9e321a0ae5d0aace349efaaff7691',
+        'md5': '41faaba0de90483fb4848952af7c7d0d',
         'info_dict': {
             'id': '158036v',
             'ext': 'mp4',
@@ -232,6 +246,10 @@ class VikiIE(VikiBaseIE):
             'title': 'Love In Magic',
             'age_limit': 13,
         },
+        'params': {
+            'format': 'bestvideo',
+        },
+        'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
     }]
 
     def _real_extract(self, url):
@@ -249,22 +267,19 @@ def _real_extract(self, url):
         self._check_errors(video)
 
         title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False)
+        episode_number = int_or_none(video.get('number'))
         if not title:
-            title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id
-            container_titles = video.get('container', {}).get('titles', {})
+            title = 'Episode %d' % episode_number if video.get('type') == 'episode' else video.get('id') or video_id
+            container_titles = try_get(video, lambda x: x['container']['titles'], dict) or {}
             container_title = self.dict_selection(container_titles, 'en')
             title = '%s - %s' % (container_title, title)
 
         description = self.dict_selection(video.get('descriptions', {}), 'en')
 
-        duration = int_or_none(video.get('duration'))
-        timestamp = parse_iso8601(video.get('created_at'))
-        uploader = video.get('author')
-        like_count = int_or_none(video.get('likes', {}).get('count'))
-        age_limit = parse_age_limit(video.get('rating'))
+        like_count = int_or_none(try_get(video, lambda x: x['likes']['count']))
 
         thumbnails = []
-        for thumbnail_id, thumbnail in video.get('images', {}).items():
+        for thumbnail_id, thumbnail in (video.get('images') or {}).items():
             thumbnails.append({
                 'id': thumbnail_id,
                 'url': thumbnail.get('url'),
@@ -289,7 +304,7 @@ def _real_extract(self, url):
                 }]
         except AttributeError:
             # fall-back to the old way if there isn't a streamSubtitles attribute
-            for subtitle_lang, _ in video.get('subtitle_completions', {}).items():
+            for subtitle_lang, _ in (video.get('subtitle_completions') or {}).items():
                 subtitles[subtitle_lang] = [{
                     'ext': subtitles_format,
                     'url': self._prepare_call(
@@ -300,13 +315,15 @@ def _real_extract(self, url):
             'id': video_id,
             'title': title,
             'description': description,
-            'duration': duration,
-            'timestamp': timestamp,
-            'uploader': uploader,
+            'duration': int_or_none(video.get('duration')),
+            'timestamp': parse_iso8601(video.get('created_at')),
+            'uploader': video.get('author'),
+            'uploader_url': video.get('author_url'),
             'like_count': like_count,
-            'age_limit': age_limit,
+            'age_limit': parse_age_limit(video.get('rating')),
             'thumbnails': thumbnails,
             'subtitles': subtitles,
+            'episode_number': episode_number,
         }
 
         formats = []
@@ -400,7 +417,7 @@ class VikiChannelIE(VikiBaseIE):
         'info_dict': {
             'id': '50c',
             'title': 'Boys Over Flowers',
-            'description': 'md5:ecd3cff47967fe193cff37c0bec52790',
+            'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59',
         },
         'playlist_mincount': 71,
     }, {
@@ -411,6 +428,7 @@ class VikiChannelIE(VikiBaseIE):
             'description': 'md5:05bf5471385aa8b21c18ad450e350525',
         },
         'playlist_count': 127,
+        'skip': 'Page not found',
     }, {
         'url': 'http://www.viki.com/news/24569c-showbiz-korea',
         'only_matching': True,
index bbb1024d91852ba2d58d00ae15c41fb368aba24a..ecfb5f0c5fbe9212f93f777910e8d0231a85dff8 100644 (file)
@@ -221,10 +221,12 @@ def _parse_config(self, config, video_id):
             'is_live': is_live,
         }
 
-    def _extract_original_format(self, url, video_id):
+    def _extract_original_format(self, url, video_id, unlisted_hash=None):
+        query = {'action': 'load_download_config'}
+        if unlisted_hash:
+            query['unlisted_hash'] = unlisted_hash
         download_data = self._download_json(
-            url, video_id, fatal=False,
-            query={'action': 'load_download_config'},
+            url, video_id, fatal=False, query=query,
             headers={'X-Requested-With': 'XMLHttpRequest'})
         if download_data:
             source_file = download_data.get('source_file')
@@ -504,6 +506,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
         {
             'url': 'https://vimeo.com/160743502/abd0e13fb4',
             'only_matching': True,
+        },
+        {
+            # requires passing unlisted_hash(a52724358e) to load_download_config request
+            'url': 'https://vimeo.com/392479337/a52724358e',
+            'only_matching': True,
         }
         # https://gettingthingsdone.com/workflowmap/
         # vimeo embed with check-password page protected by Referer header
@@ -668,7 +675,8 @@ def _real_extract(self, url):
             if config.get('view') == 4:
                 config = self._verify_player_video_password(redirect_url, video_id, headers)
 
-        vod = config.get('video', {}).get('vod', {})
+        video = config.get('video') or {}
+        vod = video.get('vod') or {}
 
         def is_rented():
             if '>You rented this title.<' in webpage:
@@ -728,7 +736,7 @@ def is_rented():
         formats = []
 
         source_format = self._extract_original_format(
-            'https://vimeo.com/' + video_id, video_id)
+            'https://vimeo.com/' + video_id, video_id, video.get('unlisted_hash'))
         if source_format:
             formats.append(source_format)
 
index d9c277bc3cb0221cd926c54a64f95bcec928bd3d..25f487e1ee03297fd5fcd36aaeae5fe6efb5ccc1 100644 (file)
@@ -1,40 +1,55 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_urllib_parse_urlparse,
+)
 from ..utils import (
     int_or_none,
+    month_by_abbreviation,
     parse_filesize,
-    unified_strdate,
 )
 
 
 class XboxClipsIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?xboxclips\.com/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\w-]{36})'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?(?:xboxclips\.com|gameclips\.io)/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})'
+    _TESTS = [{
         'url': 'http://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325',
         'md5': 'fbe1ec805e920aeb8eced3c3e657df5d',
         'info_dict': {
             'id': '074a69a9-5faf-46aa-b93b-9909c1720325',
             'ext': 'mp4',
-            'title': 'Iabdulelah playing Titanfall',
+            'title': 'iAbdulElah playing Titanfall',
             'filesize_approx': 26800000,
             'upload_date': '20140807',
             'duration': 56,
         }
-    }
+    }, {
+        'url': 'https://gameclips.io/iAbdulElah/074a69a9-5faf-46aa-b93b-9909c1720325',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
+        if '/video.php' in url:
+            qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+            url = 'https://gameclips.io/%s/%s' % (qs['gamertag'][0], qs['vid'][0])
+
         webpage = self._download_webpage(url, video_id)
+        info = self._parse_html5_media_entries(url, webpage, video_id)[0]
 
-        video_url = self._html_search_regex(
-            r'>(?:Link|Download): <a[^>]+href="([^"]+)"', webpage, 'video URL')
-        title = self._html_search_regex(
-            r'<title>XboxClips \| ([^<]+)</title>', webpage, 'title')
-        upload_date = unified_strdate(self._html_search_regex(
-            r'>Recorded: ([^<]+)<', webpage, 'upload date', fatal=False))
+        title = self._html_search_meta(['og:title', 'twitter:title'], webpage)
+        upload_date = None
+        mobj = re.search(
+            r'>Recorded: (\d{2})-(Jan|Feb|Mar|Apr|May|Ju[nl]|Aug|Sep|Oct|Nov|Dec)-(\d{4})',
+            webpage)
+        if mobj:
+            upload_date = '%s%.2d%s' % (mobj.group(3), month_by_abbreviation(mobj.group(2)), mobj.group(1))
         filesize = parse_filesize(self._html_search_regex(
             r'>Size: ([^<]+)<', webpage, 'file size', fatal=False))
         duration = int_or_none(self._html_search_regex(
@@ -42,12 +57,12 @@ def _real_extract(self, url):
         view_count = int_or_none(self._html_search_regex(
             r'>Views: (\d+)<', webpage, 'view count', fatal=False))
 
-        return {
+        info.update({
             'id': video_id,
-            'url': video_url,
             'title': title,
             'upload_date': upload_date,
             'filesize_approx': filesize,
             'duration': duration,
             'view_count': view_count,
-        }
+        })
+        return info
index 3cc13bc5ba889c2d56e9d182c4794dd3e5502fea..4bcbaa4dbb7b4614ad7411640c8f3f90364e53d4 100644 (file)
@@ -1,8 +1,9 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
 import hashlib
+import itertools
+import re
 
 from .common import InfoExtractor
 from ..compat import compat_str
@@ -209,17 +210,27 @@ def _extract_tracks(self, source, item_id, url, tld):
             missing_track_ids = [
                 track_id for track_id in track_ids
                 if track_id not in present_track_ids]
-            missing_tracks = self._call_api(
-                'track-entries', tld, url, item_id,
-                'Downloading missing tracks JSON', {
-                    'entries': ','.join(missing_track_ids),
-                    'lang': tld,
-                    'external-domain': 'music.yandex.%s' % tld,
-                    'overembed': 'false',
-                    'strict': 'true',
-                })
-            if missing_tracks:
-                tracks.extend(missing_tracks)
+            # Request missing tracks in chunks to avoid exceeding max HTTP header size,
+            # see https://github.com/ytdl-org/youtube-dl/issues/27355
+            _TRACKS_PER_CHUNK = 250
+            for chunk_num in itertools.count(0):
+                start = chunk_num * _TRACKS_PER_CHUNK
+                end = start + _TRACKS_PER_CHUNK
+                missing_track_ids_req = missing_track_ids[start:end]
+                assert missing_track_ids_req
+                missing_tracks = self._call_api(
+                    'track-entries', tld, url, item_id,
+                    'Downloading missing tracks JSON chunk %d' % (chunk_num + 1), {
+                        'entries': ','.join(missing_track_ids_req),
+                        'lang': tld,
+                        'external-domain': 'music.yandex.%s' % tld,
+                        'overembed': 'false',
+                        'strict': 'true',
+                    })
+                if missing_tracks:
+                    tracks.extend(missing_tracks)
+                if end >= len(missing_track_ids):
+                    break
 
         return tracks
 
index b2b02f5e2b89a3e4a27f58133f738a6f8904a323..8fc3706df739c56338dde8493df2363ba00e1c35 100644 (file)
@@ -324,7 +324,9 @@ def _extract_video(self, renderer):
             r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
             'view count', default=None))
         uploader = try_get(
-            renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
+            renderer,
+            (lambda x: x['ownerText']['runs'][0]['text'],
+             lambda x: x['shortBylineText']['runs'][0]['text']), compat_str)
         return {
             '_type': 'url_transparent',
             'ie_key': YoutubeIE.ie_key(),
@@ -340,64 +342,70 @@ def _extract_video(self, renderer):
 
 class YoutubeIE(YoutubeBaseInfoExtractor):
     IE_DESC = 'YouTube.com'
+    _INVIDIOUS_SITES = (
+        # invidious-redirect websites
+        r'(?:www\.)?redirect\.invidious\.io',
+        r'(?:(?:www|dev)\.)?invidio\.us',
+        # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
+        r'(?:www\.)?invidious\.pussthecat\.org',
+        r'(?:www\.)?invidious\.048596\.xyz',
+        r'(?:www\.)?invidious\.zee\.li',
+        r'(?:www\.)?vid\.puffyan\.us',
+        r'(?:(?:www|au)\.)?ytprivate\.com',
+        r'(?:www\.)?invidious\.namazso\.eu',
+        r'(?:www\.)?invidious\.ethibox\.fr',
+        r'(?:www\.)?inv\.skyn3t\.in',
+        r'(?:www\.)?invidious\.himiko\.cloud',
+        r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
+        r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
+        r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
+        r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
+        # youtube-dl invidious instances list
+        r'(?:(?:www|no)\.)?invidiou\.sh',
+        r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
+        r'(?:www\.)?invidious\.kabi\.tk',
+        r'(?:www\.)?invidious\.13ad\.de',
+        r'(?:www\.)?invidious\.mastodon\.host',
+        r'(?:www\.)?invidious\.zapashcanon\.fr',
+        r'(?:www\.)?invidious\.kavin\.rocks',
+        r'(?:www\.)?invidious\.tube',
+        r'(?:www\.)?invidiou\.site',
+        r'(?:www\.)?invidious\.site',
+        r'(?:www\.)?invidious\.xyz',
+        r'(?:www\.)?invidious\.nixnet\.xyz',
+        r'(?:www\.)?invidious\.drycat\.fr',
+        r'(?:www\.)?tube\.poal\.co',
+        r'(?:www\.)?tube\.connect\.cafe',
+        r'(?:www\.)?vid\.wxzm\.sx',
+        r'(?:www\.)?vid\.mint\.lgbt',
+        r'(?:www\.)?yewtu\.be',
+        r'(?:www\.)?yt\.elukerio\.org',
+        r'(?:www\.)?yt\.lelux\.fi',
+        r'(?:www\.)?invidious\.ggc-project\.de',
+        r'(?:www\.)?yt\.maisputain\.ovh',
+        r'(?:www\.)?invidious\.toot\.koeln',
+        r'(?:www\.)?invidious\.fdn\.fr',
+        r'(?:www\.)?watch\.nettohikari\.com',
+        r'(?:www\.)?kgg2m7yk5aybusll\.onion',
+        r'(?:www\.)?qklhadlycap4cnod\.onion',
+        r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
+        r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
+        r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
+        r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
+        r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
+        r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
+    )
     _VALID_URL = r"""(?x)^
                      (
                          (?:https?://|//)                                    # http(s):// or protocol-independent URL
-                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
-                            (?:www\.)?deturl\.com/www\.youtube\.com/|
-                            (?:www\.)?pwnyoutube\.com/|
-                            (?:www\.)?hooktube\.com/|
-                            (?:www\.)?yourepeat\.com/|
-                            tube\.majestyc\.net/|
-                            # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
-                            (?:www\.)?invidious\.pussthecat\.org/|
-                            (?:www\.)?invidious\.048596\.xyz/|
-                            (?:www\.)?invidious\.zee\.li/|
-                            (?:www\.)?vid\.puffyan\.us/|
-                            (?:(?:www|au)\.)?ytprivate\.com/|
-                            (?:www\.)?invidious\.namazso\.eu/|
-                            (?:www\.)?invidious\.ethibox\.fr/|
-                            (?:www\.)?inv\.skyn3t\.in/|
-                            (?:www\.)?invidious\.himiko\.cloud/|
-                            (?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion/|
-                            (?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion/|
-                            (?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion/|
-                            (?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion/|
-                            (?:(?:www|dev)\.)?invidio\.us/|
-                            (?:(?:www|no)\.)?invidiou\.sh/|
-                            (?:(?:www|fi)\.)?invidious\.snopyta\.org/|
-                            (?:www\.)?invidious\.kabi\.tk/|
-                            (?:www\.)?invidious\.13ad\.de/|
-                            (?:www\.)?invidious\.mastodon\.host/|
-                            (?:www\.)?invidious\.zapashcanon\.fr/|
-                            (?:www\.)?invidious\.kavin\.rocks/|
-                            (?:www\.)?invidious\.tube/|
-                            (?:www\.)?invidiou\.site/|
-                            (?:www\.)?invidious\.site/|
-                            (?:www\.)?invidious\.xyz/|
-                            (?:www\.)?invidious\.nixnet\.xyz/|
-                            (?:www\.)?invidious\.drycat\.fr/|
-                            (?:www\.)?tube\.poal\.co/|
-                            (?:www\.)?tube\.connect\.cafe/|
-                            (?:www\.)?vid\.wxzm\.sx/|
-                            (?:www\.)?vid\.mint\.lgbt/|
-                            (?:www\.)?yewtu\.be/|
-                            (?:www\.)?yt\.elukerio\.org/|
-                            (?:www\.)?yt\.lelux\.fi/|
-                            (?:www\.)?invidious\.ggc-project\.de/|
-                            (?:www\.)?yt\.maisputain\.ovh/|
-                            (?:www\.)?invidious\.toot\.koeln/|
-                            (?:www\.)?invidious\.fdn\.fr/|
-                            (?:www\.)?watch\.nettohikari\.com/|
-                            (?:www\.)?kgg2m7yk5aybusll\.onion/|
-                            (?:www\.)?qklhadlycap4cnod\.onion/|
-                            (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
-                            (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
-                            (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
-                            (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
-                            (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
-                            (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
-                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
+                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
+                            (?:www\.)?deturl\.com/www\.youtube\.com|
+                            (?:www\.)?pwnyoutube\.com|
+                            (?:www\.)?hooktube\.com|
+                            (?:www\.)?yourepeat\.com|
+                            tube\.majestyc\.net|
+                            %(invidious)s|
+                            youtube\.googleapis\.com)/                        # the various hostnames, with wildcard subdomains
                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                          (?:                                                  # the various things that can precede the ID:
                              (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
@@ -412,6 +420,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                             youtu\.be|                                        # just youtu.be/xxxx
                             vid\.plus|                                        # or vid.plus/xxxx
                             zwearz\.com/watch|                                # or zwearz.com/watch/xxxx
+                            %(invidious)s
                          )/
                          |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                          )
@@ -424,7 +433,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                         )
                      )
                      (?(1).+)?                                                # if we found the ID, everything can follow
-                     $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
+                     $""" % {
+        'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE,
+        'invidious': '|'.join(_INVIDIOUS_SITES),
+    }
     _PLAYER_INFO_RE = (
         r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
         r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
@@ -1031,6 +1043,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'url': 'https://invidio.us/watch?v=BaW_jenozKc',
             'only_matching': True,
         },
+        {
+            'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc',
+            'only_matching': True,
+        },
+        {
+            # from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m
+            'url': 'https://redirect.invidious.io/Yh0AhrY9GjA',
+            'only_matching': True,
+        },
         {
             # DRM protected
             'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
@@ -1169,6 +1190,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'skip_download': True,
             },
         },
+        {
+            # controversial video, only works with bpctr when authenticated with cookies
+            'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg',
+            'only_matching': True,
+        },
     ]
 
     def __init__(self, *args, **kwargs):
@@ -1426,7 +1452,7 @@ def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
         video_id = self._match_id(url)
         base_url = self.http_scheme() + '//www.youtube.com/'
-        webpage_url = base_url + 'watch?v=' + video_id + '&has_verified=1'
+        webpage_url = base_url + 'watch?v=' + video_id + '&has_verified=1&bpctr=9999999999'
         webpage = self._download_webpage(webpage_url, video_id, fatal=False)
 
         player_response = None
diff --git a/youtube_dlc/extractor/zhihu.py b/youtube_dlc/extractor/zhihu.py
new file mode 100644 (file)
index 0000000..d1ed55b
--- /dev/null
@@ -0,0 +1,69 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import float_or_none, int_or_none
+
+
+class ZhihuIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?zhihu\.com/zvideo/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'https://www.zhihu.com/zvideo/1342930761977176064',
+        'md5': 'c8d4c9cd72dd58e6f9bc9c2c84266464',
+        'info_dict': {
+            'id': '1342930761977176064',
+            'ext': 'mp4',
+            'title': '写春联也太难了吧!',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'uploader': '桥半舫',
+            'timestamp': 1612959715,
+            'upload_date': '20210210',
+            'uploader_id': '244ecb13b0fd7daf92235288c8ca3365',
+            'duration': 146.333,
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        zvideo = self._download_json(
+            'https://www.zhihu.com/api/v4/zvideos/' + video_id, video_id)
+        title = zvideo['title']
+        video = zvideo.get('video') or {}
+
+        formats = []
+        for format_id, q in (video.get('playlist') or {}).items():
+            play_url = q.get('url') or q.get('play_url')
+            if not play_url:
+                continue
+            formats.append({
+                'asr': int_or_none(q.get('sample_rate')),
+                'filesize': int_or_none(q.get('size')),
+                'format_id': format_id,
+                'fps': int_or_none(q.get('fps')),
+                'height': int_or_none(q.get('height')),
+                'tbr': float_or_none(q.get('bitrate')),
+                'url': play_url,
+                'width': int_or_none(q.get('width')),
+            })
+        self._sort_formats(formats)
+
+        author = zvideo.get('author') or {}
+        url_token = author.get('url_token')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': video.get('thumbnail') or zvideo.get('image_url'),
+            'uploader': author.get('name'),
+            'timestamp': int_or_none(zvideo.get('published_at')),
+            'uploader_id': author.get('id'),
+            'uploader_url': 'https://www.zhihu.com/people/' + url_token if url_token else None,
+            'duration': float_or_none(video.get('duration')),
+            'view_count': int_or_none(zvideo.get('play_count')),
+            'like_count': int_or_none(zvideo.get('liked_count')),
+            'comment_count': int_or_none(zvideo.get('comment_count')),
+        }
index d1f13f3ea9d4b11de7b07cb2dec601a8c962d15e..926673363b3073034e539e8db259b30448fd7129 100644 (file)
@@ -127,10 +127,13 @@ def is_webp(path):
 
             except PostProcessingError as err:
                 self.report_warning('unable to embed using ffprobe & ffmpeg; %s' % error_to_compat_str(err))
-                if not check_executable('AtomicParsley', ['-v']):
+                atomicparsley = next((
+                    x for x in ['AtomicParsley', 'atomicparsley']
+                    if check_executable(x, ['-v'])), None)
+                if atomicparsley is None:
                     raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.')
 
-                cmd = [encodeFilename('AtomicParsley', True),
+                cmd = [encodeFilename(atomicparsley, True),
                        encodeFilename(filename, True),
                        encodeArgument('--artwork'),
                        encodeFilename(thumbnail_filename, True),