[ant1newsgr] Add extractor (#1982)

[yt-dlp.git] / yt_dlp / extractor / vimeo.py
diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py

index 04c504934283c631ed65dbe82aa9bf299216607f..458a751fe1ddc6bfb31b4387f19a45448349c6fc 100644 (file)
--- a/yt_dlp/extractor/vimeo.py
+++ b/yt_dlp/extractor/vimeo.py
@@ -18,6 +18,7 @@
      determine_ext,
      ExtractorError,
      get_element_by_class,
+    HEADRequest,
      js_to_json,
      int_or_none,
      merge_dicts,
@@ -27,7 +28,6 @@
      parse_qs,
      sanitized_Request,
      smuggle_url,
-    std_headers,
      str_or_none,
      try_get,
      unified_timestamp,
@@ -35,6 +35,7 @@
      urlencode_postdata,
      urljoin,
      unescapeHTML,
+    urlhandle_detect_ext,
  )
  
  
@@ -117,10 +118,9 @@ def _set_vimeo_cookie(self, name, value):
          self._set_cookie('vimeo.com', name, value)
  
      def _vimeo_sort_formats(self, formats):
-        # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps
-        # at the same time without actual units specified. This lead to wrong sorting.
-        # But since yt-dlp prefers 'res,fps' anyway, 'field_preference' is not needed
-        self._sort_formats(formats)
+        # Note: Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps
+        # at the same time without actual units specified.
+        self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'source'))
  
      def _parse_config(self, config, video_id):
          video_data = config['video']
@@ -130,6 +130,8 @@ def _parse_config(self, config, video_id):
          request = config.get('request') or {}
  
          formats = []
+        subtitles = {}
+
          config_files = video_data.get('files') or request.get('files') or {}
          for f in (config_files.get('progressive') or []):
              video_url = f.get('url')
@@ -138,6 +140,7 @@ def _parse_config(self, config, video_id):
              formats.append({
                  'url': video_url,
                  'format_id': 'http-%s' % f.get('quality'),
+                'source_preference': 10,
                  'width': int_or_none(f.get('width')),
                  'height': int_or_none(f.get('height')),
                  'fps': int_or_none(f.get('fps')),
@@ -161,21 +164,23 @@ def _parse_config(self, config, video_id):
                      sep_manifest_urls = [(format_id, manifest_url)]
                  for f_id, m_url in sep_manifest_urls:
                      if files_type == 'hls':
-                        formats.extend(self._extract_m3u8_formats(
-                            m_url, video_id, 'mp4',
-                            'm3u8' if is_live else 'm3u8_native', m3u8_id=f_id,
+                        fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                            m_url, video_id, 'mp4', live=is_live, m3u8_id=f_id,
                              note='Downloading %s m3u8 information' % cdn_name,
-                            fatal=False))
+                            fatal=False)
+                        formats.extend(fmts)
+                        self._merge_subtitles(subs, target=subtitles)
                      elif files_type == 'dash':
                          if 'json=1' in m_url:
                              real_m_url = (self._download_json(m_url, video_id, fatal=False) or {}).get('url')
                              if real_m_url:
                                  m_url = real_m_url
-                        mpd_formats = self._extract_mpd_formats(
+                        fmts, subs = self._extract_mpd_formats_and_subtitles(
                              m_url.replace('/master.json', '/master.mpd'), video_id, f_id,
                              'Downloading %s MPD information' % cdn_name,
                              fatal=False)
-                        formats.extend(mpd_formats)
+                        formats.extend(fmts)
+                        self._merge_subtitles(subs, target=subtitles)
  
          live_archive = live_event.get('archive') or {}
          live_archive_source_url = live_archive.get('source_url')
@@ -186,12 +191,11 @@ def _parse_config(self, config, video_id):
                  'quality': 10,
              })
  
-        subtitles = {}
          for tt in (request.get('text_tracks') or []):
-            subtitles[tt['lang']] = [{
+            subtitles.setdefault(tt['lang'], []).append({
                  'ext': 'vtt',
                  'url': urljoin('https://vimeo.com', tt['url']),
-            }]
+            })
  
          thumbnails = []
          if not is_live:
@@ -210,14 +214,25 @@ def _parse_config(self, config, video_id):
          owner = video_data.get('owner') or {}
          video_uploader_url = owner.get('url')
  
+        duration = int_or_none(video_data.get('duration'))
+        chapter_data = try_get(config, lambda x: x['embed']['chapters']) or []
+        chapters = [{
+            'title': current_chapter.get('title'),
+            'start_time': current_chapter.get('timecode'),
+            'end_time': next_chapter.get('timecode'),
+        } for current_chapter, next_chapter in zip(chapter_data, chapter_data[1:] + [{'timecode': duration}])]
+        if chapters and chapters[0]['start_time']:  # Chapters may not start from 0
+            chapters[:0] = [{'title': '<Untitled>', 'start_time': 0, 'end_time': chapters[0]['start_time']}]
+
          return {
              'id': str_or_none(video_data.get('id')) or video_id,
-            'title': self._live_title(video_title) if is_live else video_title,
+            'title': video_title,
              'uploader': owner.get('name'),
              'uploader_id': video_uploader_url.split('/')[-1] if video_uploader_url else None,
              'uploader_url': video_uploader_url,
              'thumbnails': thumbnails,
-            'duration': int_or_none(video_data.get('duration')),
+            'duration': duration,
+            'chapters': chapters or None,
              'formats': formats,
              'subtitles': subtitles,
              'is_live': is_live,
@@ -229,27 +244,26 @@ def _extract_original_format(self, url, video_id, unlisted_hash=None):
              query['unlisted_hash'] = unlisted_hash
          download_data = self._download_json(
              url, video_id, fatal=False, query=query,
-            headers={'X-Requested-With': 'XMLHttpRequest'})
-        if download_data:
-            source_file = download_data.get('source_file')
-            if isinstance(source_file, dict):
-                download_url = source_file.get('download_url')
-                if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'):
-                    source_name = source_file.get('public_name', 'Original')
-                    if self._is_valid_url(download_url, video_id, '%s video' % source_name):
-                        ext = (try_get(
-                            source_file, lambda x: x['extension'],
-                            compat_str) or determine_ext(
-                            download_url, None) or 'mp4').lower()
-                        return {
-                            'url': download_url,
-                            'ext': ext,
-                            'width': int_or_none(source_file.get('width')),
-                            'height': int_or_none(source_file.get('height')),
-                            'filesize': parse_filesize(source_file.get('size')),
-                            'format_id': source_name,
-                            'quality': 1,
-                        }
+            headers={'X-Requested-With': 'XMLHttpRequest'},
+            expected_status=(403, 404)) or {}
+        source_file = download_data.get('source_file')
+        download_url = try_get(source_file, lambda x: x['download_url'])
+        if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'):
+            source_name = source_file.get('public_name', 'Original')
+            if self._is_valid_url(download_url, video_id, '%s video' % source_name):
+                ext = (try_get(
+                    source_file, lambda x: x['extension'],
+                    compat_str) or determine_ext(
+                    download_url, None) or 'mp4').lower()
+                return {
+                    'url': download_url,
+                    'ext': ext,
+                    'width': int_or_none(source_file.get('width')),
+                    'height': int_or_none(source_file.get('height')),
+                    'filesize': parse_filesize(source_file.get('size')),
+                    'format_id': source_name,
+                    'quality': 1,
+                }
  
          jwt_response = self._download_json(
              'https://vimeo.com/_rv/viewer', video_id, note='Downloading jwt token', fatal=False) or {}
@@ -258,15 +272,19 @@ def _extract_original_format(self, url, video_id, unlisted_hash=None):
          headers = {'Authorization': 'jwt %s' % jwt_response['jwt']}
          original_response = self._download_json(
              f'https://api.vimeo.com/videos/{video_id}', video_id,
-            headers=headers, fatal=False) or {}
-        for download_data in original_response.get('download') or {}:
+            headers=headers, fatal=False, expected_status=(403, 404)) or {}
+        for download_data in original_response.get('download') or []:
              download_url = download_data.get('link')
              if not download_url or download_data.get('quality') != 'source':
                  continue
-            query = parse_qs(download_url)
+            ext = determine_ext(parse_qs(download_url).get('filename', [''])[0].lower(), default_ext=None)
+            if not ext:
+                urlh = self._request_webpage(
+                    HEADRequest(download_url), video_id, fatal=False, note='Determining source extension')
+                ext = urlh and urlhandle_detect_ext(urlh)
              return {
                  'url': download_url,
-                'ext': determine_ext(query.get('filename', [''])[0].lower()),
+                'ext': ext or 'unknown_video',
                  'format_id': download_data.get('public_name', 'Original'),
                  'width': int_or_none(download_data.get('width')),
                  'height': int_or_none(download_data.get('height')),
@@ -291,7 +309,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
                          )?
                          vimeo(?:pro)?\.com/
                          (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
-                        (?:.*?/)?
+                        (?:[^/]+/)*?
                          (?:
                              (?:
                                  play_redirect_hls|
@@ -362,7 +380,6 @@ class VimeoIE(VimeoBaseInfoExtractor):
              'params': {
                  'format': 'best[protocol=https]',
              },
-            'expected_warnings': ['Unable to download JSON metadata'],
          },
          {
              'url': 'http://vimeo.com/68375962',
@@ -402,7 +419,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
                  'upload_date': '20130928',
                  'duration': 187,
              },
-            'expected_warnings': ['Unable to download JSON metadata'],
+            'params': {'format': 'http-1080p'},
          },
          {
              'url': 'http://vimeo.com/76979871',
@@ -424,7 +441,8 @@ class VimeoIE(VimeoBaseInfoExtractor):
                      'es': [{'ext': 'vtt'}],
                      'fr': [{'ext': 'vtt'}],
                  },
-            }
+            },
+            'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'],
          },
          {
              # from https://www.ouya.tv/game/Pier-Solar-and-the-Great-Architects/
@@ -469,7 +487,6 @@ class VimeoIE(VimeoBaseInfoExtractor):
                  'description': 'md5:f2edc61af3ea7a5592681ddbb683db73',
                  'upload_date': '20200225',
              },
-            'expected_warnings': ['Unable to download JSON metadata'],
          },
          {
              # only available via https://vimeo.com/channels/tributes/6213729 and
@@ -491,7 +508,6 @@ class VimeoIE(VimeoBaseInfoExtractor):
              'params': {
                  'skip_download': True,
              },
-            'expected_warnings': ['Unable to download JSON metadata'],
          },
          {
              # redirects to ondemand extractor and should be passed through it
@@ -511,7 +527,6 @@ class VimeoIE(VimeoBaseInfoExtractor):
              'params': {
                  'skip_download': True,
              },
-            'expected_warnings': ['Unable to download JSON metadata'],
              'skip': 'this page is no longer available.',
          },
          {
@@ -572,13 +587,72 @@ class VimeoIE(VimeoBaseInfoExtractor):
              'only_matching': True,
          },
          {
+            'note': 'Direct URL with hash',
              'url': 'https://vimeo.com/160743502/abd0e13fb4',
-            'only_matching': True,
+            'info_dict': {
+                'id': '160743502',
+                'ext': 'mp4',
+                'uploader': 'Julian Tryba',
+                'uploader_id': 'aliniamedia',
+                'title': 'Harrisville New Hampshire',
+                'timestamp': 1459259666,
+                'upload_date': '20160329',
+            },
+            'params': {'skip_download': True},
+        },
+        {
+            'url': 'https://vimeo.com/138909882',
+            'info_dict': {
+                'id': '138909882',
+                'ext': 'mp4',
+                'title': 'Eastnor Castle 2015 Firework Champions - The Promo!',
+                'description': 'md5:5967e090768a831488f6e74b7821b3c1',
+                'uploader_id': 'fireworkchampions',
+                'uploader': 'Firework Champions',
+                'upload_date': '20150910',
+                'timestamp': 1441901895,
+            },
+            'params': {
+                'skip_download': True,
+                'format': 'Original',
+            },
+        },
+        {
+            'url': 'https://vimeo.com/channels/staffpicks/143603739',
+            'info_dict': {
+                'id': '143603739',
+                'ext': 'mp4',
+                'uploader': 'Karim Huu Do',
+                'timestamp': 1445846953,
+                'upload_date': '20151026',
+                'title': 'The Shoes - Submarine Feat. Blaine Harrison',
+                'uploader_id': 'karimhd',
+                'description': 'md5:8e2eea76de4504c2e8020a9bcfa1e843',
+            },
+            'params': {'skip_download': 'm3u8'},
          },
          {
              # requires passing unlisted_hash(a52724358e) to load_download_config request
              'url': 'https://vimeo.com/392479337/a52724358e',
              'only_matching': True,
+        },
+        {
+            # similar, but all numeric: ID must be 581039021, not 9603038895
+            # issue #29690
+            'url': 'https://vimeo.com/581039021/9603038895',
+            'info_dict': {
+                'id': '581039021',
+                # these have to be provided but we don't care
+                'ext': 'mp4',
+                'timestamp': 1627621014,
+                'title': 're:.+',
+                'uploader_id': 're:.+',
+                'uploader': 're:.+',
+                'upload_date': r're:\d+',
+            },
+            'params': {
+                'skip_download': True,
+            },
          }
          # https://gettingthingsdone.com/workflowmap/
          # vimeo embed with check-password page protected by Referer header
@@ -701,14 +775,15 @@ def _try_album_password(self, url):
  
      def _real_extract(self, url):
          url, data = unsmuggle_url(url, {})
-        headers = std_headers.copy()
+        headers = self.get_param('http_headers').copy()
          if 'http_headers' in data:
              headers.update(data['http_headers'])
          if 'Referer' not in headers:
              headers['Referer'] = url
  
          # Extract ID from URL
-        video_id, unlisted_hash = self._match_valid_url(url).groups()
+        mobj = self._match_valid_url(url).groupdict()
+        video_id, unlisted_hash = mobj['id'], mobj.get('unlisted_hash')
          if unlisted_hash:
              return self._extract_from_api(video_id, unlisted_hash)
  
@@ -768,18 +843,19 @@ def _real_extract(self, url):
          timestamp = None
          video_description = None
          info_dict = {}
+        config_url = None
  
          channel_id = self._search_regex(
              r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None)
          if channel_id:
              config_url = self._html_search_regex(
-                r'\bdata-config-url="([^"]+)"', webpage, 'config URL')
+                r'\bdata-config-url="([^"]+)"', webpage, 'config URL', default=None)
              video_description = clean_html(get_element_by_class('description', webpage))
              info_dict.update({
                  'channel_id': channel_id,
                  'channel_url': 'https://vimeo.com/channels/' + channel_id,
              })
-        else:
+        if not config_url:
              page_config = self._parse_json(self._search_regex(
                  r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});',
                  webpage, 'page config', default='{}'), video_id, fatal=False)
@@ -1100,10 +1176,10 @@ class VimeoGroupsIE(VimeoChannelIE):
      IE_NAME = 'vimeo:group'
      _VALID_URL = r'https://vimeo\.com/groups/(?P<id>[^/]+)(?:/(?!videos?/\d+)|$)'
      _TESTS = [{
-        'url': 'https://vimeo.com/groups/kattykay',
+        'url': 'https://vimeo.com/groups/meetup',
          'info_dict': {
-            'id': 'kattykay',
-            'title': 'Katty Kay',
+            'id': 'meetup',
+            'title': 'Vimeo Meetup!',
          },
          'playlist_mincount': 27,
      }]
@@ -1125,7 +1201,6 @@ class VimeoReviewIE(VimeoBaseInfoExtractor):
              'uploader_id': 'user21297594',
              'description': "Comedian Dick Hardwick's five minute demo filmed in front of a live theater audience.\nEdit by Doug Mattocks",
          },
-        'expected_warnings': ['Unable to download JSON metadata'],
      }, {
          'note': 'video player needs Referer',
          'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053',