jfr.im git - yt-dlp.git/commitdiff
Update to ytdl-2021.01.16
author pukkandan <redacted>
Sat, 16 Jan 2021 12:40:15 +0000 (18:10 +0530)
committer pukkandan <redacted>
Sat, 16 Jan 2021 13:20:48 +0000 (18:50 +0530)
14 files changed:
youtube_dlc/YoutubeDL.py
youtube_dlc/extractor/adn.py
youtube_dlc/extractor/animeondemand.py
youtube_dlc/extractor/cspan.py
youtube_dlc/extractor/extractors.py
youtube_dlc/extractor/khanacademy.py
youtube_dlc/extractor/mixcloud.py
youtube_dlc/extractor/peertube.py
youtube_dlc/extractor/spike.py
youtube_dlc/extractor/threeqsdn.py
youtube_dlc/extractor/twitch.py
youtube_dlc/extractor/twitter.py
youtube_dlc/extractor/youporn.py
youtube_dlc/extractor/youtube.py

index 5141159d27051609c37c2cc953c0fc7633c2ba19..5c1129a97502a34930e425f5b53b49cbbc7d6b86 100644 (file)
@@ -369,6 +369,8 @@ class YoutubeDL(object):
     _pps = []
     _download_retcode = None
     _num_downloads = None
+    _playlist_level = 0
+    _playlist_urls = set()
     _screen_file = None
 
     def __init__(self, params=None, auto_init=True):
@@ -1012,113 +1014,23 @@ def process_ie_result(self, ie_result, download=True, extra_info={}):
             return self.process_ie_result(
                 new_result, download=download, extra_info=extra_info)
         elif result_type in ('playlist', 'multi_video'):
-            # We process each entry in the playlist
-            playlist = ie_result.get('title') or ie_result.get('id')
-            self.to_screen('[download] Downloading playlist: %s' % playlist)
-
-            playlist_results = []
-
-            playliststart = self.params.get('playliststart', 1) - 1
-            playlistend = self.params.get('playlistend')
-            # For backwards compatibility, interpret -1 as whole list
-            if playlistend == -1:
-                playlistend = None
-
-            playlistitems_str = self.params.get('playlist_items')
-            playlistitems = None
-            if playlistitems_str is not None:
-                def iter_playlistitems(format):
-                    for string_segment in format.split(','):
-                        if '-' in string_segment:
-                            start, end = string_segment.split('-')
-                            for item in range(int(start), int(end) + 1):
-                                yield int(item)
-                        else:
-                            yield int(string_segment)
-                playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
-
-            ie_entries = ie_result['entries']
-
-            def make_playlistitems_entries(list_ie_entries):
-                num_entries = len(list_ie_entries)
-                return [
-                    list_ie_entries[i - 1] for i in playlistitems
-                    if -num_entries <= i - 1 < num_entries]
-
-            def report_download(num_entries):
+            # Protect from infinite recursion due to recursively nested playlists
+            # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
+            webpage_url = ie_result['webpage_url']
+            if webpage_url in self._playlist_urls:
                 self.to_screen(
-                    '[%s] playlist %s: Downloading %d videos' %
-                    (ie_result['extractor'], playlist, num_entries))
-
-            if isinstance(ie_entries, list):
-                n_all_entries = len(ie_entries)
-                if playlistitems:
-                    entries = make_playlistitems_entries(ie_entries)
-                else:
-                    entries = ie_entries[playliststart:playlistend]
-                n_entries = len(entries)
-                self.to_screen(
-                    '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
-                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
-            elif isinstance(ie_entries, PagedList):
-                if playlistitems:
-                    entries = []
-                    for item in playlistitems:
-                        entries.extend(ie_entries.getslice(
-                            item - 1, item
-                        ))
-                else:
-                    entries = ie_entries.getslice(
-                        playliststart, playlistend)
-                n_entries = len(entries)
-                report_download(n_entries)
-            else:  # iterable
-                if playlistitems:
-                    entries = make_playlistitems_entries(list(itertools.islice(
-                        ie_entries, 0, max(playlistitems))))
-                else:
-                    entries = list(itertools.islice(
-                        ie_entries, playliststart, playlistend))
-                n_entries = len(entries)
-                report_download(n_entries)
-
-            if self.params.get('playlistreverse', False):
-                entries = entries[::-1]
-
-            if self.params.get('playlistrandom', False):
-                random.shuffle(entries)
-
-            x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
-
-            for i, entry in enumerate(entries, 1):
-                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
-                # This __x_forwarded_for_ip thing is a bit ugly but requires
-                # minimal changes
-                if x_forwarded_for:
-                    entry['__x_forwarded_for_ip'] = x_forwarded_for
-                extra = {
-                    'n_entries': n_entries,
-                    'playlist': playlist,
-                    'playlist_id': ie_result.get('id'),
-                    'playlist_title': ie_result.get('title'),
-                    'playlist_uploader': ie_result.get('uploader'),
-                    'playlist_uploader_id': ie_result.get('uploader_id'),
-                    'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart,
-                    'extractor': ie_result['extractor'],
-                    'webpage_url': ie_result['webpage_url'],
-                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
-                    'extractor_key': ie_result['extractor_key'],
-                }
-
-                if self._match_entry(entry, incomplete=True) is not None:
-                    continue
+                    '[download] Skipping already downloaded playlist: %s'
+                    % (ie_result.get('title') or ie_result.get('id')))
+                return
 
-                entry_result = self.__process_iterable_entry(entry, download, extra)
-                # TODO: skip failed (empty) entries?
-                playlist_results.append(entry_result)
-            ie_result['entries'] = playlist_results
-            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
-            return ie_result
+            self._playlist_level += 1
+            self._playlist_urls.add(webpage_url)
+            try:
+                return self.__process_playlist(ie_result, download)
+            finally:
+                self._playlist_level -= 1
+                if not self._playlist_level:
+                    self._playlist_urls.clear()
         elif result_type == 'compat_list':
             self.report_warning(
                 'Extractor %s returned a compat_list result. '
@@ -1143,6 +1055,115 @@ def _fixup(r):
         else:
             raise Exception('Invalid result type: %s' % result_type)
 
+    def __process_playlist(self, ie_result, download):
+        # We process each entry in the playlist
+        playlist = ie_result.get('title') or ie_result.get('id')
+        self.to_screen('[download] Downloading playlist: %s' % playlist)
+
+        playlist_results = []
+
+        playliststart = self.params.get('playliststart', 1) - 1
+        playlistend = self.params.get('playlistend')
+        # For backwards compatibility, interpret -1 as whole list
+        if playlistend == -1:
+            playlistend = None
+
+        playlistitems_str = self.params.get('playlist_items')
+        playlistitems = None
+        if playlistitems_str is not None:
+            def iter_playlistitems(format):
+                for string_segment in format.split(','):
+                    if '-' in string_segment:
+                        start, end = string_segment.split('-')
+                        for item in range(int(start), int(end) + 1):
+                            yield int(item)
+                    else:
+                        yield int(string_segment)
+            playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
+
+        ie_entries = ie_result['entries']
+
+        def make_playlistitems_entries(list_ie_entries):
+            num_entries = len(list_ie_entries)
+            return [
+                list_ie_entries[i - 1] for i in playlistitems
+                if -num_entries <= i - 1 < num_entries]
+
+        def report_download(num_entries):
+            self.to_screen(
+                '[%s] playlist %s: Downloading %d videos' %
+                (ie_result['extractor'], playlist, num_entries))
+
+        if isinstance(ie_entries, list):
+            n_all_entries = len(ie_entries)
+            if playlistitems:
+                entries = make_playlistitems_entries(ie_entries)
+            else:
+                entries = ie_entries[playliststart:playlistend]
+            n_entries = len(entries)
+            self.to_screen(
+                '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
+                (ie_result['extractor'], playlist, n_all_entries, n_entries))
+        elif isinstance(ie_entries, PagedList):
+            if playlistitems:
+                entries = []
+                for item in playlistitems:
+                    entries.extend(ie_entries.getslice(
+                        item - 1, item
+                    ))
+            else:
+                entries = ie_entries.getslice(
+                    playliststart, playlistend)
+            n_entries = len(entries)
+            report_download(n_entries)
+        else:  # iterable
+            if playlistitems:
+                entries = make_playlistitems_entries(list(itertools.islice(
+                    ie_entries, 0, max(playlistitems))))
+            else:
+                entries = list(itertools.islice(
+                    ie_entries, playliststart, playlistend))
+            n_entries = len(entries)
+            report_download(n_entries)
+
+        if self.params.get('playlistreverse', False):
+            entries = entries[::-1]
+
+        if self.params.get('playlistrandom', False):
+            random.shuffle(entries)
+
+        x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
+
+        for i, entry in enumerate(entries, 1):
+            self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
+            # This __x_forwarded_for_ip thing is a bit ugly but requires
+            # minimal changes
+            if x_forwarded_for:
+                entry['__x_forwarded_for_ip'] = x_forwarded_for
+            extra = {
+                'n_entries': n_entries,
+                'playlist': playlist,
+                'playlist_id': ie_result.get('id'),
+                'playlist_title': ie_result.get('title'),
+                'playlist_uploader': ie_result.get('uploader'),
+                'playlist_uploader_id': ie_result.get('uploader_id'),
+                'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart,
+                'extractor': ie_result['extractor'],
+                'webpage_url': ie_result['webpage_url'],
+                'webpage_url_basename': url_basename(ie_result['webpage_url']),
+                'extractor_key': ie_result['extractor_key'],
+            }
+
+            if self._match_entry(entry, incomplete=True) is not None:
+                continue
+
+            entry_result = self.__process_iterable_entry(entry, download, extra)
+            # TODO: skip failed (empty) entries?
+            playlist_results.append(entry_result)
+        ie_result['entries'] = playlist_results
+        self.to_screen('[download] Finished downloading playlist: %s' % playlist)
+        return ie_result
+
     @__handle_extraction_exceptions
     def __process_iterable_entry(self, entry, download, extra_info):
         return self.process_ie_result(
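
The recursion guard above, read in isolation: a class-level URL set remembers every playlist entered during the current top-level call, and a depth counter makes sure the set is cleared only when the outermost playlist finishes. A minimal standalone sketch (Downloader and process_playlist are stand-ins for YoutubeDL and __process_playlist):

    class Downloader(object):
        _playlist_level = 0     # current nesting depth
        _playlist_urls = set()  # playlist URLs seen in this top-level call

        def process_playlist(self, ie_result):
            # Stand-in for __process_playlist; for nested playlists it
            # would call process_ie_result again on each entry.
            return ie_result

        def process_ie_result(self, ie_result):
            webpage_url = ie_result['webpage_url']
            if webpage_url in self._playlist_urls:
                return None  # already being processed further up the stack
            self._playlist_level += 1
            self._playlist_urls.add(webpage_url)
            try:
                return self.process_playlist(ie_result)
            finally:
                self._playlist_level -= 1
                if not self._playlist_level:
                    # outermost playlist done: forget the seen URLs
                    self._playlist_urls.clear()
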
index c95ad2173522455d925b3248065ce914a35e3f0c..d611ee23746d1079087c5aef0a3edda539ed3a77 100644 (file)
@@ -10,6 +10,7 @@
 from .common import InfoExtractor
 from ..aes import aes_cbc_decrypt
 from ..compat import (
+    compat_HTTPError,
     compat_b64decode,
     compat_ord,
 )
     bytes_to_long,
     ExtractorError,
     float_or_none,
+    int_or_none,
     intlist_to_bytes,
     long_to_bytes,
     pkcs1pad,
     strip_or_none,
-    urljoin,
+    try_get,
+    unified_strdate,
 )
 
 
@@ -31,16 +34,27 @@ class ADNIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P<id>\d+)'
     _TEST = {
         'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites',
-        'md5': 'e497370d847fd79d9d4c74be55575c7a',
+        'md5': '0319c99885ff5547565cacb4f3f9348d',
         'info_dict': {
             'id': '7778',
             'ext': 'mp4',
-            'title': 'Blue Exorcist - Kyôto Saga - Épisode 1',
+            'title': 'Blue Exorcist - Kyôto Saga - Episode 1',
             'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5',
+            'series': 'Blue Exorcist - Kyôto Saga',
+            'duration': 1467,
+            'release_date': '20170106',
+            'comment_count': int,
+            'average_rating': float,
+            'season_number': 2,
+            'episode': 'Début des hostilités',
+            'episode_number': 1,
         }
     }
+
     _BASE_URL = 'http://animedigitalnetwork.fr'
-    _RSA_KEY = (0xc35ae1e4356b65a73b551493da94b8cb443491c0aa092a357a5aee57ffc14dda85326f42d716e539a34542a0d3f363adf16c5ec222d713d5997194030ee2e4f0d1fb328c01a81cf6868c090d50de8e169c6b13d1675b9eeed1cbc51e1fffca9b38af07f37abd790924cd3bee59d0257cfda4fe5f3f0534877e21ce5821447d1b, 65537)
+    _API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/'
+    _PLAYER_BASE_URL = _API_BASE_URL + 'player/'
+    _RSA_KEY = (0x9B42B08905199A5CCE2026274399CA560ECB209EE9878A708B1C0812E1BB8CB5D1FB7441861147C1A1F2F3A0476DD63A9CAC20D3E983613346850AA6CB38F16DC7D720FD7D86FC6E5B3D5BBC72E14CD0BF9E869F2CEA2CCAD648F1DCE38F1FF916CEFB2D339B64AA0264372344BC775E265E8A852F88144AB0BD9AA06C1A4ABB, 65537)
     _POS_ALIGN_MAP = {
         'start': 1,
         'end': 3,
@@ -54,26 +68,24 @@ class ADNIE(InfoExtractor):
     def _ass_subtitles_timecode(seconds):
         return '%01d:%02d:%02d.%02d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 100)
 
-    def _get_subtitles(self, sub_path, video_id):
-        if not sub_path:
+    def _get_subtitles(self, sub_url, video_id):
+        if not sub_url:
             return None
 
         enc_subtitles = self._download_webpage(
-            urljoin(self._BASE_URL, sub_path),
-            video_id, 'Downloading subtitles location', fatal=False) or '{}'
+            sub_url, video_id, 'Downloading subtitles location', fatal=False) or '{}'
         subtitle_location = (self._parse_json(enc_subtitles, video_id, fatal=False) or {}).get('location')
         if subtitle_location:
             enc_subtitles = self._download_webpage(
-                urljoin(self._BASE_URL, subtitle_location),
-                video_id, 'Downloading subtitles data', fatal=False,
-                headers={'Origin': 'https://animedigitalnetwork.fr'})
+                subtitle_location, video_id, 'Downloading subtitles data',
+                fatal=False, headers={'Origin': 'https://animedigitalnetwork.fr'})
         if not enc_subtitles:
             return None
 
         # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js
         dec_subtitles = intlist_to_bytes(aes_cbc_decrypt(
             bytes_to_intlist(compat_b64decode(enc_subtitles[24:])),
-            bytes_to_intlist(binascii.unhexlify(self._K + '4b8ef13ec1872730')),
+            bytes_to_intlist(binascii.unhexlify(self._K + 'ab9f52f5baae7c72')),
             bytes_to_intlist(compat_b64decode(enc_subtitles[:24]))
         ))
         subtitles_json = self._parse_json(
@@ -119,59 +131,76 @@ def _get_subtitles(self, sub_path, video_id):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        player_config = self._parse_json(self._search_regex(
-            r'playerConfig\s*=\s*({.+});', webpage,
-            'player config', default='{}'), video_id, fatal=False)
-        if not player_config:
-            config_url = urljoin(self._BASE_URL, self._search_regex(
-                r'(?:id="player"|class="[^"]*adn-player-container[^"]*")[^>]+data-url="([^"]+)"',
-                webpage, 'config url'))
-            player_config = self._download_json(
-                config_url, video_id,
-                'Downloading player config JSON metadata')['player']
-
-        video_info = {}
-        video_info_str = self._search_regex(
-            r'videoInfo\s*=\s*({.+});', webpage,
-            'video info', fatal=False)
-        if video_info_str:
-            video_info = self._parse_json(
-                video_info_str, video_id, fatal=False) or {}
-
-        options = player_config.get('options') or {}
-        metas = options.get('metas') or {}
-        links = player_config.get('links') or {}
-        sub_path = player_config.get('subtitles')
-        error = None
-        if not links:
-            links_url = player_config.get('linksurl') or options['videoUrl']
-            token = options['token']
-            self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)])
-            message = bytes_to_intlist(json.dumps({
-                'k': self._K,
-                'e': 60,
-                't': token,
-            }))
+        video_base_url = self._PLAYER_BASE_URL + 'video/%s/' % video_id
+        player = self._download_json(
+            video_base_url + 'configuration', video_id,
+            'Downloading player config JSON metadata')['player']
+        options = player['options']
+
+        user = options['user']
+        if not user.get('hasAccess'):
+            raise ExtractorError(
+                'This video is only available for paying users', expected=True)
+            # self.raise_login_required() # FIXME: Login is not implemented
+
+        token = self._download_json(
+            user.get('refreshTokenUrl') or (self._PLAYER_BASE_URL + 'refresh/token'),
+            video_id, 'Downloading access token', headers={
+                'x-player-refresh-token': user['refreshToken']
+            }, data=b'')['token']
+
+        links_url = try_get(options, lambda x: x['video']['url']) or (video_base_url + 'link')
+        self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)])
+        message = bytes_to_intlist(json.dumps({
+            'k': self._K,
+            't': token,
+        }))
+
+        # Sometimes authentication fails for no good reason, retry with
+        # a different random padding
+        links_data = None
+        for _ in range(3):
             padded_message = intlist_to_bytes(pkcs1pad(message, 128))
             n, e = self._RSA_KEY
             encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n))
             authorization = base64.b64encode(encrypted_message).decode()
-            links_data = self._download_json(
-                urljoin(self._BASE_URL, links_url), video_id,
-                'Downloading links JSON metadata', headers={
-                    'Authorization': 'Bearer ' + authorization,
-                })
-            links = links_data.get('links') or {}
-            metas = metas or links_data.get('meta') or {}
-            sub_path = sub_path or links_data.get('subtitles') or \
-                'index.php?option=com_vodapi&task=subtitles.getJSON&format=json&id=' + video_id
-            sub_path += '&token=' + token
-            error = links_data.get('error')
-        title = metas.get('title') or video_info['title']
+
+            try:
+                links_data = self._download_json(
+                    links_url, video_id, 'Downloading links JSON metadata', headers={
+                        'X-Player-Token': authorization
+                    }, query={
+                        'freeWithAds': 'true',
+                        'adaptive': 'false',
+                        'withMetadata': 'true',
+                        'source': 'Web'
+                    })
+                break
+            except ExtractorError as e:
+                if not isinstance(e.cause, compat_HTTPError):
+                    raise e
+
+                if e.cause.code == 401:
+                    # This usually goes away with a different random pkcs1pad, so retry
+                    continue
+
+                error = self._parse_json(e.cause.read(), video_id)
+                message = error.get('message')
+                if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country':
+                    self.raise_geo_restricted(msg=message)
+                else:
+                    raise ExtractorError(message)
+        else:
+            raise ExtractorError('Giving up retrying')
+
+        links = links_data.get('links') or {}
+        metas = links_data.get('metadata') or {}
+        sub_url = (links.get('subtitles') or {}).get('all')
+        video_info = links_data.get('video') or {}
+        title = metas['title']
 
         formats = []
-        for format_id, qualities in links.items():
+        for format_id, qualities in (links.get('streaming') or {}).items():
             if not isinstance(qualities, dict):
                 continue
             for quality, load_balancer_url in qualities.items():
@@ -189,19 +218,26 @@ def _real_extract(self, url):
                     for f in m3u8_formats:
                         f['language'] = 'fr'
                 formats.extend(m3u8_formats)
-        if not error:
-            error = options.get('error')
-        if not formats and error:
-            raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
         self._sort_formats(formats)
 
+        video = (self._download_json(
+            self._API_BASE_URL + 'video/%s' % video_id, video_id,
+            'Downloading additional video metadata', fatal=False) or {}).get('video') or {}
+        show = video.get('show') or {}
+
         return {
             'id': video_id,
             'title': title,
-            'description': strip_or_none(metas.get('summary') or video_info.get('resume')),
-            'thumbnail': video_info.get('image'),
+            'description': strip_or_none(metas.get('summary') or video.get('summary')),
+            'thumbnail': video_info.get('image') or player.get('image'),
             'formats': formats,
-            'subtitles': self.extract_subtitles(sub_path, video_id),
-            'episode': metas.get('subtitle') or video_info.get('videoTitle'),
-            'series': video_info.get('playlistTitle'),
+            'subtitles': self.extract_subtitles(sub_url, video_id),
+            'episode': metas.get('subtitle') or video.get('name'),
+            'episode_number': int_or_none(video.get('shortNumber')),
+            'series': show.get('title'),
+            'season_number': int_or_none(video.get('season')),
+            'duration': int_or_none(video_info.get('duration') or video.get('duration')),
+            'release_date': unified_strdate(video.get('releaseDate')),
+            'average_rating': float_or_none(video.get('rating') or metas.get('rating')),
+            'comment_count': int_or_none(video.get('commentsCount')),
         }
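
For reference, the authorization value sent as X-Player-Token above can be reproduced with a short Python 3 sketch. This is not the extractor's code: pkcs1pad is re-implemented inline instead of imported from ..utils, the helper name player_token_header is invented, and the modulus/exponent are the _RSA_KEY from this diff:

    import base64
    import json
    import random

    N = 0x9B42B08905199A5CCE2026274399CA560ECB209EE9878A708B1C0812E1BB8CB5D1FB7441861147C1A1F2F3A0476DD63A9CAC20D3E983613346850AA6CB38F16DC7D720FD7D86FC6E5B3D5BBC72E14CD0BF9E869F2CEA2CCAD648F1DCE38F1FF916CEFB2D339B64AA0264372344BC775E265E8A852F88144AB0BD9AA06C1A4ABB
    E = 65537

    def pkcs1pad(data, length):
        # Simplified PKCS#1 v1.5 type-2 padding over a list of byte values
        if len(data) > length - 11:
            raise ValueError('Input too long')
        pad = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
        return [0, 2] + pad + [0] + data

    def player_token_header(k, token):
        # k is the 16-hex-digit self._K, token the refresh-token response
        message = list(json.dumps({'k': k, 't': token}).encode())
        padded = bytes(pkcs1pad(message, 128))
        encrypted = pow(int.from_bytes(padded, 'big'), E, N)
        return base64.b64encode(encrypted.to_bytes(128, 'big')).decode()

Because the padding is random, a 401 can simply be retried with a fresh pad, which is what the three-attempt for/else loop above does.
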
index 00ce684d1c9811ef3ba885194c23fd012d935b6b..54e097d2f756e7c73e6e9b84e00a7d5b0c04b19e 100644 (file)
@@ -116,8 +116,6 @@ def _real_extract(self, url):
             r'(?s)<div[^>]+itemprop="description"[^>]*>(.+?)</div>',
             webpage, 'anime description', default=None)
 
-        entries = []
-
         def extract_info(html, video_id, num=None):
             title, description = [None] * 2
             formats = []
@@ -233,7 +231,7 @@ def extract_entries(html, video_id, common_info, num=None):
                 self._sort_formats(info['formats'])
                 f = common_info.copy()
                 f.update(info)
-                entries.append(f)
+                yield f
 
             # Extract teaser/trailer only when full episode is not available
             if not info['formats']:
@@ -247,7 +245,7 @@ def extract_entries(html, video_id, common_info, num=None):
                         'title': m.group('title'),
                         'url': urljoin(url, m.group('href')),
                     })
-                    entries.append(f)
+                    yield f
 
         def extract_episodes(html):
             for num, episode_html in enumerate(re.findall(
@@ -275,7 +273,8 @@ def extract_episodes(html):
                     'episode_number': episode_number,
                 }
 
-                extract_entries(episode_html, video_id, common_info)
+                for e in extract_entries(episode_html, video_id, common_info):
+                    yield e
 
         def extract_film(html, video_id):
             common_info = {
@@ -283,11 +282,18 @@ def extract_film(html, video_id):
                 'title': anime_title,
                 'description': anime_description,
             }
-            extract_entries(html, video_id, common_info)
+            for e in extract_entries(html, video_id, common_info):
+                yield e
 
-        extract_episodes(webpage)
+        def entries():
+            has_episodes = False
+            for e in extract_episodes(webpage):
+                has_episodes = True
+                yield e
 
-        if not entries:
-            extract_film(webpage, anime_id)
+            if not has_episodes:
+                for e in extract_film(webpage, anime_id):
+                    yield e
 
-        return self.playlist_result(entries, anime_id, anime_title, anime_description)
+        return self.playlist_result(
+            entries(), anime_id, anime_title, anime_description)
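
The animeondemand rewrite swaps an eagerly built entries list for generators, so extraction is lazy and the film fallback runs only when no episode was yielded. The pattern in the abstract (primary and fallback are hypothetical callables):

    def entries(primary, fallback):
        # Yield everything from primary(); use fallback() only if it was empty.
        found = False
        for item in primary():
            found = True
            yield item
        if not found:
            for item in fallback():
                yield item

playlist_result() accepts such a generator directly, which is why the rewritten return statement can pass entries() unevaluated.
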
index 766942146ff60458061bd35c048b6dd399e253f1..2e01aff488f00609744b5c16ba757fdcf0a193d2 100644 (file)
@@ -8,11 +8,14 @@
     ExtractorError,
     extract_attributes,
     find_xpath_attr,
+    get_element_by_attribute,
     get_element_by_class,
     int_or_none,
     js_to_json,
     merge_dicts,
+    parse_iso8601,
     smuggle_url,
+    str_to_int,
     unescapeHTML,
 )
 from .senateisvp import SenateISVPIE
@@ -116,8 +119,30 @@ def add_referer(formats):
                 jwsetup, video_id, require_title=False, m3u8_id='hls',
                 base_url=url)
             add_referer(info['formats'])
+            for subtitles in info['subtitles'].values():
+                for subtitle in subtitles:
+                    ext = determine_ext(subtitle['url'])
+                    if ext == 'php':
+                        ext = 'vtt'
+                    subtitle['ext'] = ext
             ld_info = self._search_json_ld(webpage, video_id, default={})
-            return merge_dicts(info, ld_info)
+            title = get_element_by_class('video-page-title', webpage) or \
+                self._og_search_title(webpage)
+            description = get_element_by_attribute('itemprop', 'description', webpage) or \
+                self._html_search_meta(['og:description', 'description'], webpage)
+            return merge_dicts(info, ld_info, {
+                'title': title,
+                'thumbnail': get_element_by_attribute('itemprop', 'thumbnailUrl', webpage),
+                'description': description,
+                'timestamp': parse_iso8601(get_element_by_attribute('itemprop', 'uploadDate', webpage)),
+                'location': get_element_by_attribute('itemprop', 'contentLocation', webpage),
+                'duration': int_or_none(self._search_regex(
+                    r'jwsetup\.seclength\s*=\s*(\d+);',
+                    webpage, 'duration', fatal=False)),
+                'view_count': str_to_int(self._search_regex(
+                    r"<span[^>]+class='views'[^>]*>([\d,]+)\s+Views</span>",
+                    webpage, 'views', fatal=False)),
+            })
 
         # Obsolete
         # We first look for clipid, because clipprog always appears before
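
merge_dicts gives earlier arguments precedence, so the scraped itemprop fallbacks added above only fill keys that info and the JSON-LD data left empty. A rough re-statement of the youtube-dl utils semantics (Python 3 str standing in for compat_str):

    def merge_dicts(*dicts):
        # First non-None value per key wins; a non-empty string may
        # still replace an earlier empty string.
        merged = {}
        for d in dicts:
            for k, v in d.items():
                if v is None:
                    continue
                if (k not in merged
                        or (isinstance(v, str) and v
                            and isinstance(merged[k], str) and not merged[k])):
                    merged[k] = v
        return merged
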
index 08d19017fbc7869bc704ecd1374af62c782e5177..8b322466bc7f1c387410e6da6c7fb2d09a674b3c 100644 (file)
 from .karrierevideos import KarriereVideosIE
 from .keezmovies import KeezMoviesIE
 from .ketnet import KetnetIE
-from .khanacademy import KhanAcademyIE
+from .khanacademy import (
+    KhanAcademyIE,
+    KhanAcademyUnitIE,
+)
 from .kickstarter import KickStarterIE
 from .kinja import KinjaEmbedIE
 from .kinopoisk import KinoPoiskIE
index 61739efa7a4c3b84892083eab10237c23eb69e3d..87e520378b264beac046af3c12f844d2ff843df2 100644 (file)
 from __future__ import unicode_literals
 
-import re
+import json
 
 from .common import InfoExtractor
 from ..utils import (
-    unified_strdate,
+    int_or_none,
+    parse_iso8601,
+    try_get,
 )
 
 
-class KhanAcademyIE(InfoExtractor):
-    _VALID_URL = r'^https?://(?:(?:www|api)\.)?khanacademy\.org/(?P<key>[^/]+)/(?:[^/]+/){,2}(?P<id>[^?#/]+)(?:$|[?#])'
-    IE_NAME = 'KhanAcademy'
+class KhanAcademyBaseIE(InfoExtractor):
+    _VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P<id>(?:[^/]+/){%s}%s[^?#/&]+)'
 
-    _TESTS = [{
-        'url': 'http://www.khanacademy.org/video/one-time-pad',
-        'md5': '7b391cce85e758fb94f763ddc1bbb979',
+    def _parse_video(self, video):
+        return {
+            '_type': 'url_transparent',
+            'url': video['youtubeId'],
+            'id': video.get('slug'),
+            'title': video.get('title'),
+            'thumbnail': video.get('imageUrl') or video.get('thumbnailUrl'),
+            'duration': int_or_none(video.get('duration')),
+            'description': video.get('description'),
+            'ie_key': 'Youtube',
+        }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        component_props = self._parse_json(self._download_json(
+            'https://www.khanacademy.org/api/internal/graphql',
+            display_id, query={
+                'hash': 1604303425,
+                'variables': json.dumps({
+                    'path': display_id,
+                    'queryParams': '',
+                }),
+            })['data']['contentJson'], display_id)['componentProps']
+        return self._parse_component_props(component_props)
+
+
+class KhanAcademyIE(KhanAcademyBaseIE):
+    IE_NAME = 'khanacademy'
+    _VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/')
+    _TEST = {
+        'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad',
+        'md5': '9c84b7b06f9ebb80d22a5c8dedefb9a0',
         'info_dict': {
-            'id': 'one-time-pad',
-            'ext': 'webm',
+            'id': 'FlIG3TvQCBQ',
+            'ext': 'mp4',
             'title': 'The one-time pad',
             'description': 'The perfect cipher',
             'duration': 176,
             'uploader': 'Brit Cruise',
             'uploader_id': 'khanacademy',
             'upload_date': '20120411',
+            'timestamp': 1334170113,
+            'license': 'cc-by-nc-sa',
         },
         'add_ie': ['Youtube'],
-    }, {
-        'url': 'https://www.khanacademy.org/math/applied-math/cryptography',
+    }
+
+    def _parse_component_props(self, component_props):
+        video = component_props['tutorialPageData']['contentModel']
+        info = self._parse_video(video)
+        author_names = video.get('authorNames')
+        info.update({
+            'uploader': ', '.join(author_names) if author_names else None,
+            'timestamp': parse_iso8601(video.get('dateAdded')),
+            'license': video.get('kaUserLicense'),
+        })
+        return info
+
+
+class KhanAcademyUnitIE(KhanAcademyBaseIE):
+    IE_NAME = 'khanacademy:unit'
+    _VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('2', '')) + '/?(?:[?#&]|$)'
+    _TEST = {
+        'url': 'https://www.khanacademy.org/computing/computer-science/cryptography',
         'info_dict': {
             'id': 'cryptography',
-            'title': 'Journey into cryptography',
+            'title': 'Cryptography',
             'description': 'How have humans protected their secret messages through history? What has changed today?',
         },
-        'playlist_mincount': 3,
-    }]
-
-    def _real_extract(self, url):
-        m = re.match(self._VALID_URL, url)
-        video_id = m.group('id')
+        'playlist_mincount': 31,
+    }
 
-        if m.group('key') == 'video':
-            data = self._download_json(
-                'http://api.khanacademy.org/api/v1/videos/' + video_id,
-                video_id, 'Downloading video info')
+    def _parse_component_props(self, component_props):
+        curation = component_props['curation']
 
-            upload_date = unified_strdate(data['date_added'])
-            uploader = ', '.join(data['author_names'])
-            return {
-                '_type': 'url_transparent',
-                'url': data['url'],
-                'id': video_id,
-                'title': data['title'],
-                'thumbnail': data['image_url'],
-                'duration': data['duration'],
-                'description': data['description'],
-                'uploader': uploader,
-                'upload_date': upload_date,
+        entries = []
+        tutorials = try_get(curation, lambda x: x['tabs'][0]['modules'][0]['tutorials'], list) or []
+        for tutorial_number, tutorial in enumerate(tutorials, 1):
+            chapter_info = {
+                'chapter': tutorial.get('title'),
+                'chapter_number': tutorial_number,
+                'chapter_id': tutorial.get('id'),
             }
-        else:
-            # topic
-            data = self._download_json(
-                'http://api.khanacademy.org/api/v1/topic/' + video_id,
-                video_id, 'Downloading topic info')
+            for content_item in (tutorial.get('contentItems') or []):
+                if content_item.get('kind') == 'Video':
+                    info = self._parse_video(content_item)
+                    info.update(chapter_info)
+                    entries.append(info)
 
-            entries = [
-                {
-                    '_type': 'url',
-                    'url': c['url'],
-                    'id': c['id'],
-                    'title': c['title'],
-                }
-                for c in data['children'] if c['kind'] in ('Video', 'Topic')]
-
-            return {
-                '_type': 'playlist',
-                'id': video_id,
-                'title': data['title'],
-                'description': data['description'],
-                'entries': entries,
-            }
+        return self.playlist_result(
+            entries, curation.get('unit'), curation.get('title'),
+            curation.get('description'))
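
The rewritten Khan Academy extractor fetches everything through the internal GraphQL endpoint shown above, then double-parses the response, since contentJson is itself a JSON string. A hedged sketch of the same request using the requests library (the extractor goes through _download_json; the hash value is taken verbatim from the diff and presumably names a persisted query):

    import json
    import requests

    def fetch_component_props(display_id):
        resp = requests.get(
            'https://www.khanacademy.org/api/internal/graphql',
            params={
                'hash': 1604303425,
                'variables': json.dumps({'path': display_id, 'queryParams': ''}),
            })
        resp.raise_for_status()
        # contentJson must be parsed a second time
        return json.loads(resp.json()['data']['contentJson'])['componentProps']
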
index 9759560f1bee5642c7e4f11c8aa8891fd4b3ed7d..69319857dfb872fee9137d78cd89c433d812a594 100644 (file)
@@ -251,8 +251,11 @@ def _real_extract(self, url):
                 cloudcast_url = cloudcast.get('url')
                 if not cloudcast_url:
                     continue
+                slug = try_get(cloudcast, lambda x: x['slug'], compat_str)
+                owner_username = try_get(cloudcast, lambda x: x['owner']['username'], compat_str)
+                video_id = '%s_%s' % (owner_username, slug) if slug and owner_username else None
                 entries.append(self.url_result(
-                    cloudcast_url, MixcloudIE.ie_key(), cloudcast.get('slug')))
+                    cloudcast_url, MixcloudIE.ie_key(), video_id))
 
             page_info = items['pageInfo']
             has_next_page = page_info['hasNextPage']
@@ -321,7 +324,8 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
     _DESCRIPTION_KEY = 'biog'
     _ROOT_TYPE = 'user'
     _NODE_TEMPLATE = '''slug
-          url'''
+          url
+          owner { username }'''
 
     def _get_playlist_title(self, title, slug):
         return '%s (%s)' % (title, slug)
@@ -345,6 +349,7 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
     _NODE_TEMPLATE = '''cloudcast {
             slug
             url
+            owner { username }
           }'''
 
     def _get_cloudcast(self, node):
index c39d12728d4fb2aa61494de77c9964d76d45d355..c2ca71c71d3badaa942987b54dc5b8564b09bc81 100644 (file)
@@ -450,6 +450,18 @@ class PeerTubeIE(InfoExtractor):
             'tags': ['framasoft', 'peertube'],
             'categories': ['Science & Technology'],
         }
+    }, {
+        # Issue #26002
+        'url': 'peertube:spacepub.space:d8943b2d-8280-497b-85ec-bc282ec2afdc',
+        'info_dict': {
+            'id': 'd8943b2d-8280-497b-85ec-bc282ec2afdc',
+            'ext': 'mp4',
+            'title': 'Dot matrix printer shell demo',
+            'uploader_id': '3',
+            'timestamp': 1587401293,
+            'upload_date': '20200420',
+            'uploader': 'Drew DeVault',
+        }
     }, {
         'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44',
         'only_matching': True,
@@ -526,7 +538,15 @@ def _real_extract(self, url):
         title = video['name']
 
         formats = []
-        for file_ in video['files']:
+        files = video.get('files') or []
+        for playlist in (video.get('streamingPlaylists') or []):
+            if not isinstance(playlist, dict):
+                continue
+            playlist_files = playlist.get('files')
+            if not (playlist_files and isinstance(playlist_files, list)):
+                continue
+            files.extend(playlist_files)
+        for file_ in files:
             if not isinstance(file_, dict):
                 continue
             file_url = url_or_none(file_.get('fileUrl'))
index 3cee331f6a74efdd341426d9d8f5b035e41e7186..4180e71efa5eb045b25be3b45258dbb2ad4b73dd 100644 (file)
@@ -50,9 +50,15 @@ class ParamountNetworkIE(MTVServicesInfoExtractor):
         },
     }]
 
-    _FEED_URL = 'http://www.paramountnetwork.com/feeds/mrss/'
+    _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'
     _GEO_COUNTRIES = ['US']
 
+    def _get_feed_query(self, uri):
+        return {
+            'arcEp': 'paramountnetwork.com',
+            'mgid': uri,
+        }
+
     def _extract_mgid(self, webpage, url):
         root_data = self._parse_json(self._search_regex(
             r'window\.__DATA__\s*=\s*({.+})',
index f26937da1e3eaecf008d28bd6a661d0a22657ddd..f6d37bb9e0d847903eab4f9210757d41ce5cec70 100644 (file)
@@ -3,10 +3,13 @@
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_HTTPError
 from ..utils import (
     determine_ext,
-    js_to_json,
-    mimetype2ext,
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    parse_iso8601,
 )
 
 
@@ -15,29 +18,35 @@ class ThreeQSDNIE(InfoExtractor):
     IE_DESC = '3Q SDN'
     _VALID_URL = r'https?://playout\.3qsdn\.com/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
     _TESTS = [{
-        # ondemand from http://www.philharmonie.tv/veranstaltung/26/
-        'url': 'http://playout.3qsdn.com/0280d6b9-1215-11e6-b427-0cc47a188158?protocol=http',
-        'md5': 'ab040e37bcfa2e0c079f92cb1dd7f6cd',
+        # https://player.3qsdn.com/demo.html
+        'url': 'https://playout.3qsdn.com/7201c779-6b3c-11e7-a40e-002590c750be',
+        'md5': '64a57396b16fa011b15e0ea60edce918',
         'info_dict': {
-            'id': '0280d6b9-1215-11e6-b427-0cc47a188158',
+            'id': '7201c779-6b3c-11e7-a40e-002590c750be',
             'ext': 'mp4',
-            'title': '0280d6b9-1215-11e6-b427-0cc47a188158',
+            'title': 'Video Ads',
             'is_live': False,
+            'description': 'Video Ads Demo',
+            'timestamp': 1500334803,
+            'upload_date': '20170717',
+            'duration': 888.032,
+            'subtitles': {
+                'eng': 'count:1',
+            },
         },
-        'expected_warnings': ['Failed to download MPD manifest', 'Failed to parse JSON'],
+        'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'],
     }, {
         # live video stream
-        'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true',
+        'url': 'https://playout.3qsdn.com/66e68995-11ca-11e8-9273-002590c750be',
         'info_dict': {
-            'id': 'd755d94b-4ab9-11e3-9162-0025907ad44f',
+            'id': '66e68995-11ca-11e8-9273-002590c750be',
             'ext': 'mp4',
-            'title': 're:^d755d94b-4ab9-11e3-9162-0025907ad44f [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'title': 're:^66e68995-11ca-11e8-9273-002590c750be [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
             'is_live': True,
         },
         'params': {
             'skip_download': True,  # m3u8 downloads
         },
-        'expected_warnings': ['Failed to download MPD manifest'],
     }, {
         # live audio stream
         'url': 'http://playout.3qsdn.com/9edf36e0-6bf2-11e2-a16a-9acf09e2db48',
@@ -58,6 +67,14 @@ class ThreeQSDNIE(InfoExtractor):
         # live video with rtmp link
         'url': 'https://playout.3qsdn.com/6092bb9e-8f72-11e4-a173-002590c750be',
         'only_matching': True,
+    }, {
+        # ondemand from http://www.philharmonie.tv/veranstaltung/26/
+        'url': 'http://playout.3qsdn.com/0280d6b9-1215-11e6-b427-0cc47a188158?protocol=http',
+        'only_matching': True,
+    }, {
+        # live video stream
+        'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true',
+        'only_matching': True,
     }]
 
     @staticmethod
@@ -70,73 +87,78 @@ def _extract_url(webpage):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        js = self._download_webpage(
-            'http://playout.3qsdn.com/%s' % video_id, video_id,
-            query={'js': 'true'})
-
-        if any(p in js for p in (
-                '>This content is not available in your country',
-                'playout.3qsdn.com/forbidden')):
-            self.raise_geo_restricted()
-
-        stream_content = self._search_regex(
-            r'streamContent\s*:\s*(["\'])(?P<content>.+?)\1', js,
-            'stream content', default='demand', group='content')
+        try:
+            config = self._download_json(
+                url.replace('://playout.3qsdn.com/', '://playout.3qsdn.com/config/'), video_id)
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                self.raise_geo_restricted()
+            raise
 
-        live = stream_content == 'live'
-
-        stream_type = self._search_regex(
-            r'streamType\s*:\s*(["\'])(?P<type>audio|video)\1', js,
-            'stream type', default='video', group='type')
+        live = config.get('streamContent') == 'live'
+        aspect = float_or_none(config.get('aspect'))
 
         formats = []
-        urls = set()
-
-        def extract_formats(item_url, item={}):
-            if not item_url or item_url in urls:
-                return
-            urls.add(item_url)
-            ext = mimetype2ext(item.get('type')) or determine_ext(item_url, default_ext=None)
-            if ext == 'mpd':
+        for source_type, source in (config.get('sources') or {}).items():
+            if not source:
+                continue
+            if source_type == 'dash':
                 formats.extend(self._extract_mpd_formats(
-                    item_url, video_id, mpd_id='mpd', fatal=False))
-            elif ext == 'm3u8':
+                    source, video_id, mpd_id='mpd', fatal=False))
+            elif source_type == 'hls':
                 formats.extend(self._extract_m3u8_formats(
-                    item_url, video_id, 'mp4',
-                    entry_protocol='m3u8' if live else 'm3u8_native',
+                    source, video_id, 'mp4', 'm3u8' if live else 'm3u8_native',
                     m3u8_id='hls', fatal=False))
-            elif ext == 'f4m':
-                formats.extend(self._extract_f4m_formats(
-                    item_url, video_id, f4m_id='hds', fatal=False))
-            else:
-                if not self._is_valid_url(item_url, video_id):
-                    return
-                formats.append({
-                    'url': item_url,
-                    'format_id': item.get('quality'),
-                    'ext': 'mp4' if item_url.startswith('rtsp') else ext,
-                    'vcodec': 'none' if stream_type == 'audio' else None,
-                })
-
-        for item_js in re.findall(r'({[^{]*?\b(?:src|source)\s*:\s*["\'].+?})', js):
-            f = self._parse_json(
-                item_js, video_id, transform_source=js_to_json, fatal=False)
-            if not f:
+            elif source_type == 'progressive':
+                for s in source:
+                    src = s.get('src')
+                    if not (src and self._is_valid_url(src, video_id)):
+                        continue
+                    width = None
+                    format_id = ['http']
+                    ext = determine_ext(src)
+                    if ext:
+                        format_id.append(ext)
+                    height = int_or_none(s.get('height'))
+                    if height:
+                        format_id.append('%dp' % height)
+                        if aspect:
+                            width = int(height * aspect)
+                    formats.append({
+                        'ext': ext,
+                        'format_id': '-'.join(format_id),
+                        'height': height,
+                        'source_preference': 0,
+                        'url': src,
+                        'vcodec': 'none' if height == 0 else None,
+                        'width': width,
+                    })
+        for f in formats:
+            if f.get('acodec') == 'none':
+                f['preference'] = -40
+            elif f.get('vcodec') == 'none':
+                f['preference'] = -50
+        self._sort_formats(formats, ('preference', 'width', 'height', 'source_preference', 'tbr', 'vbr', 'abr', 'ext', 'format_id'))
+
+        subtitles = {}
+        for subtitle in (config.get('subtitles') or []):
+            src = subtitle.get('src')
+            if not src:
                 continue
-            extract_formats(f.get('src'), f)
-
-        # More relaxed version to collect additional URLs and acting
-        # as a future-proof fallback
-        for _, src in re.findall(r'\b(?:src|source)\s*:\s*(["\'])((?:https?|rtsp)://.+?)\1', js):
-            extract_formats(src)
-
-        self._sort_formats(formats)
+            subtitles.setdefault(subtitle.get('label') or 'eng', []).append({
+                'url': src,
+            })
 
-        title = self._live_title(video_id) if live else video_id
+        title = config.get('title') or video_id
 
         return {
             'id': video_id,
-            'title': title,
+            'title': self._live_title(title) if live else title,
+            'thumbnail': config.get('poster') or None,
+            'description': config.get('description') or None,
+            'timestamp': parse_iso8601(config.get('upload_date')),
+            'duration': float_or_none(config.get('vlength')) or None,
             'is_live': live,
             'formats': formats,
+            'subtitles': subtitles,
         }
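
In the progressive branch above, format_id is assembled from the protocol, the file extension and the height, and height 0 marks an audio-only source. A tiny illustration (helper name invented):

    def http_format_id(ext, height):
        # Mirrors the '-'.join(format_id) construction in the hunk above
        parts = ['http']
        if ext:
            parts.append(ext)
        if height:
            parts.append('%dp' % height)
        return '-'.join(parts)

    print(http_format_id('mp4', 720))  # http-mp4-720p
    print(http_format_id('mp3', 0))    # http-mp3 (audio-only, vcodec='none')
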
index 503d019de185c4f4dcd2f24f75c6467a6f7baa48..fc8cb73217a7699bc6f798ece23154e3f5316571 100644 (file)
@@ -17,6 +17,7 @@
 )
 from ..utils import (
     clean_html,
+    dict_get,
     ExtractorError,
     float_or_none,
     int_or_none,
@@ -76,14 +77,14 @@ def login_step(page, urlh, note, data):
 
             headers = {
                 'Referer': page_url,
-                'Origin': page_url,
+                'Origin': 'https://www.twitch.tv',
                 'Content-Type': 'text/plain;charset=UTF-8',
             }
 
             response = self._download_json(
                 post_url, None, note, data=json.dumps(form).encode(),
                 headers=headers, expected_status=400)
-            error = response.get('error_description') or response.get('error_code')
+            error = dict_get(response, ('error', 'error_description', 'error_code'))
             if error:
                 fail(error)
 
@@ -137,13 +138,17 @@ def _prefer_source(self, formats):
         self._sort_formats(formats)
 
     def _download_base_gql(self, video_id, ops, note, fatal=True):
+        headers = {
+            'Content-Type': 'text/plain;charset=UTF-8',
+            'Client-ID': self._CLIENT_ID,
+        }
+        gql_auth = self._get_cookies('https://gql.twitch.tv').get('auth-token')
+        if gql_auth:
+            headers['Authorization'] = 'OAuth ' + gql_auth.value
         return self._download_json(
             'https://gql.twitch.tv/gql', video_id, note,
             data=json.dumps(ops).encode(),
-            headers={
-                'Content-Type': 'text/plain;charset=UTF-8',
-                'Client-ID': self._CLIENT_ID,
-            }, fatal=fatal)
+            headers=headers, fatal=fatal)
 
     def _download_gql(self, video_id, ops, note, fatal=True):
         for op in ops:
index 4602c0984184238ec24b4f5614c692309de5879c..8a2a77b710e40a83c764dfca1361951e8412e636 100644 (file)
@@ -373,6 +373,24 @@ class TwitterIE(TwitterBaseIE):
             'uploader_id': '1eVjYOLGkGrQL',
         },
         'add_ie': ['TwitterBroadcast'],
+    }, {
+        # unified card
+        'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20',
+        'info_dict': {
+            'id': '1349794411333394432',
+            'ext': 'mp4',
+            'title': 'md5:d1c4941658e4caaa6cb579260d85dcba',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'description': 'md5:71ead15ec44cee55071547d6447c6a3e',
+            'uploader': 'Brooklyn Nets',
+            'uploader_id': 'BrooklynNets',
+            'duration': 324.484,
+            'timestamp': 1610651040,
+            'upload_date': '20210114',
+        },
+        'params': {
+            'skip_download': True,
+        },
     }, {
         # Twitch Clip Embed
         'url': 'https://twitter.com/GunB1g/status/1163218564784017422',
@@ -389,6 +407,22 @@ class TwitterIE(TwitterBaseIE):
         # appplayer card
         'url': 'https://twitter.com/poco_dandy/status/1150646424461176832',
         'only_matching': True,
+    }, {
+        # video_direct_message card
+        'url': 'https://twitter.com/qarev001/status/1348948114569269251',
+        'only_matching': True,
+    }, {
+        # poll2choice_video card
+        'url': 'https://twitter.com/CAF_Online/status/1349365911120195585',
+        'only_matching': True,
+    }, {
+        # poll3choice_video card
+        'url': 'https://twitter.com/SamsungMobileSA/status/1348609186725289984',
+        'only_matching': True,
+    }, {
+        # poll4choice_video card
+        'url': 'https://twitter.com/SouthamptonFC/status/1347577658079641604',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -433,8 +467,7 @@ def _real_extract(self, url):
             'tags': tags,
         }
 
-        media = try_get(status, lambda x: x['extended_entities']['media'][0])
-        if media and media.get('type') != 'photo':
+        def extract_from_video_info(media):
             video_info = media.get('video_info') or {}
 
             formats = []
@@ -461,6 +494,10 @@ def add_thumbnail(name, size):
                 'thumbnails': thumbnails,
                 'duration': float_or_none(video_info.get('duration_millis'), 1000),
             })
+
+        media = try_get(status, lambda x: x['extended_entities']['media'][0])
+        if media and media.get('type') != 'photo':
+            extract_from_video_info(media)
         else:
             card = status.get('card')
             if card:
@@ -493,7 +530,12 @@ def get_binding_value(k):
                         '_type': 'url',
                         'url': get_binding_value('card_url'),
                     })
-                # amplify, promo_video_website, promo_video_convo, appplayer, ...
+                elif card_name == 'unified_card':
+                    media_entities = self._parse_json(get_binding_value('unified_card'), twid)['media_entities']
+                    extract_from_video_info(next(iter(media_entities.values())))
+                # amplify, promo_video_website, promo_video_convo, appplayer,
+                # video_direct_message, poll2choice_video, poll3choice_video,
+                # poll4choice_video, ...
                 else:
                     is_amplify = card_name == 'amplify'
                     vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url')
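
The new unified_card branch parses the card's binding value as JSON and feeds the first media entity through the same extract_from_video_info path as ordinary tweets. A sketch with a made-up payload shape:

    import json

    # hypothetical 'unified_card' binding value
    binding_value = json.dumps({
        'media_entities': {
            '1349794411333394432': {'type': 'video', 'video_info': {'variants': []}},
        },
    })

    media_entities = json.loads(binding_value)['media_entities']
    media = next(iter(media_entities.values()))  # handed to extract_from_video_info()
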
index 7b9feafeb2a6beb75a8f7e9424fd74af0b227c3f..534270bac3d461bfca5bf0035e6435829413811d 100644 (file)
@@ -60,6 +60,9 @@ class YouPornIE(InfoExtractor):
     }, {
         'url': 'http://www.youporn.com/watch/505835',
         'only_matching': True,
+    }, {
+        'url': 'https://www.youporn.com/watch/13922959/femdom-principal/',
+        'only_matching': True,
     }]
 
     @staticmethod
@@ -88,7 +91,7 @@ def _real_extract(self, url):
         # Main source
         definitions = self._parse_json(
             self._search_regex(
-                r'mediaDefinition\s*=\s*(\[.+?\]);', webpage,
+                r'mediaDefinition\s*[=:]\s*(\[.+?\])\s*[;,]', webpage,
                 'media definitions', default='[]'),
             video_id, fatal=False)
         if definitions:
@@ -100,7 +103,7 @@ def _real_extract(self, url):
                     links.append(video_url)
 
         # Fallback #1, this also contains extra low quality 180p format
-        for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage):
+        for _, link in re.findall(r'<a[^>]+href=(["\'])(http(?:(?!\1).)+\.mp4(?:(?!\1).)*)\1[^>]+title=["\']Download [Vv]ideo', webpage):
             links.append(link)
 
         # Fallback #2 (unavailable as at 22.06.2017)
@@ -128,8 +131,9 @@ def _real_extract(self, url):
             # Video URL's path looks like this:
             #  /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
             #  /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
+            #  /videos/201703/11/109285532/1080P_4000K_109285532.mp4
             # We will benefit from it by extracting some metadata
-            mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url)
+            mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url)
             if mobj:
                 height = int(mobj.group('height'))
                 bitrate = int(mobj.group('bitrate'))
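
Dropping the trailing slash from the metadata regex lets it match the new-style path quoted in the comment above, where the digits are followed by '.mp4' rather than '/'. A quick check (filenames shortened):

    import re

    pattern = r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+'
    for path in (
        '/201012/17/505835/720p_1500k_505835/video.mp4',          # old style
        '/videos/201703/11/109285532/1080P_4000K_109285532.mp4',  # new style
    ):
        m = re.search(pattern, path)
        print(m.group('height'), m.group('bitrate'))  # 720 1500, then 1080 4000
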
index 0b87f2185e8325713c1f661cf62d29726572a643..20657bb1961183dacd68373f1ad4dbce6e650930 100644 (file)
@@ -332,6 +332,36 @@ def _extract_ytcfg(self, video_id, webpage):
                 r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
                 default='{}'), video_id, fatal=False)
 
+    def _extract_video(self, renderer):
+        video_id = renderer.get('videoId')
+        title = try_get(
+            renderer,
+            (lambda x: x['title']['runs'][0]['text'],
+             lambda x: x['title']['simpleText']), compat_str)
+        description = try_get(
+            renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
+            compat_str)
+        duration = parse_duration(try_get(
+            renderer, lambda x: x['lengthText']['simpleText'], compat_str))
+        view_count_text = try_get(
+            renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
+        view_count = str_to_int(self._search_regex(
+            r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
+            'view count', default=None))
+        uploader = try_get(
+            renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
+        return {
+            '_type': 'url_transparent',
+            'ie_key': YoutubeIE.ie_key(),
+            'id': video_id,
+            'url': video_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'view_count': view_count,
+            'uploader': uploader,
+        }
+
 
 class YoutubeIE(YoutubeBaseInfoExtractor):
     IE_DESC = 'YouTube.com'
@@ -2871,36 +2901,6 @@ def _extract_grid_item_renderer(item):
             if renderer:
                 return renderer
 
-    def _extract_video(self, renderer):
-        video_id = renderer.get('videoId')
-        title = try_get(
-            renderer,
-            (lambda x: x['title']['runs'][0]['text'],
-             lambda x: x['title']['simpleText']), compat_str)
-        description = try_get(
-            renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
-            compat_str)
-        duration = parse_duration(try_get(
-            renderer, lambda x: x['lengthText']['simpleText'], compat_str))
-        view_count_text = try_get(
-            renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
-        view_count = str_to_int(self._search_regex(
-            r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
-            'view count', default=None))
-        uploader = try_get(
-            renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
-        return {
-            '_type': 'url_transparent',
-            'ie_key': YoutubeIE.ie_key(),
-            'id': video_id,
-            'url': video_id,
-            'title': title,
-            'description': description,
-            'duration': duration,
-            'view_count': view_count,
-            'uploader': uploader,
-        }
-
     def _grid_entries(self, grid_renderer):
         for item in grid_renderer['items']:
             if not isinstance(item, dict):
@@ -3583,65 +3583,38 @@ def _entries(self, query, n):
             if not slr_contents:
                 break
 
-            isr_contents = []
-            continuation_token = None
             # Youtube sometimes adds promoted content to searches,
             # changing the index location of videos and token.
             # So we search through all entries till we find them.
-            for index, isr in enumerate(slr_contents):
+            continuation_token = None
+            for slr_content in slr_contents:
+                isr_contents = try_get(
+                    slr_content,
+                    lambda x: x['itemSectionRenderer']['contents'],
+                    list)
                 if not isr_contents:
-                    isr_contents = try_get(
-                        slr_contents,
-                        (lambda x: x[index]['itemSectionRenderer']['contents']),
-                        list)
-                    for content in isr_contents:
-                        if content.get('videoRenderer') is not None:
-                            break
-                    else:
-                        isr_contents = []
+                    continue
+                for content in isr_contents:
+                    if not isinstance(content, dict):
+                        continue
+                    video = content.get('videoRenderer')
+                    if not isinstance(video, dict):
+                        continue
+                    video_id = video.get('videoId')
+                    if not video_id:
+                        continue
+
+                    yield self._extract_video(video)
+                    total += 1
+                    if total == n:
+                        return
 
                 if continuation_token is None:
                     continuation_token = try_get(
-                        slr_contents,
-                        lambda x: x[index]['continuationItemRenderer']['continuationEndpoint']['continuationCommand'][
-                            'token'],
+                        slr_content,
+                        lambda x: x['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                         compat_str)
-                if continuation_token is not None and isr_contents:
-                    break
 
-            if not isr_contents:
-                break
-            for content in isr_contents:
-                if not isinstance(content, dict):
-                    continue
-                video = content.get('videoRenderer')
-                if not isinstance(video, dict):
-                    continue
-                video_id = video.get('videoId')
-                if not video_id:
-                    continue
-                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
-                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
-                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
-                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
-                view_count = str_to_int(self._search_regex(
-                    r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
-                    'view count', default=None))
-                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
-                total += 1
-                yield {
-                    '_type': 'url_transparent',
-                    'ie_key': YoutubeIE.ie_key(),
-                    'id': video_id,
-                    'url': video_id,
-                    'title': title,
-                    'description': description,
-                    'duration': duration,
-                    'view_count': view_count,
-                    'uploader': uploader,
-                }
-                if total == n:
-                    return
             if not continuation_token:
                 break
             data['continuation'] = continuation_token
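
The restructured search loop walks each sectionListRenderer entry exactly once, yielding videoRenderer results as it goes and picking up the continuation token from whichever entry carries it, instead of the old two-pass scan. The control flow, abstracted (fetch_page is hypothetical and returns the parsed sections plus the next token):

    def search_entries(n, fetch_page):
        total = 0
        continuation = None
        while True:
            sections, continuation = fetch_page(continuation)
            if not sections:
                break
            for videos in sections:
                for video in videos:
                    yield video
                    total += 1
                    if total == n:
                        return
            if not continuation:
                break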