[ie/chzzk] Add extractors (#8887)

[yt-dlp.git] / yt_dlp / extractor / generic.py
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py

index f9fa01feb8453efa71cbe1e0bb40f8bcd97cf41b..1f0011c09fb429541167a1b9f53f322be16ccfbc 100644 (file)
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -14,8 +14,10 @@
      ExtractorError,
      UnsupportedError,
      determine_ext,
+    determine_protocol,
      dict_get,
      extract_basic_auth,
+    filter_dict,
      format_field,
      int_or_none,
      is_html,
@@ -34,6 +36,7 @@
      unsmuggle_url,
      update_url_query,
      url_or_none,
+    urlhandle_detect_ext,
      urljoin,
      variadic,
      xpath_attr,
@@ -57,6 +60,8 @@ class GenericIE(InfoExtractor):
                  'ext': 'mp4',
                  'title': 'trailer',
                  'upload_date': '20100513',
+                'direct': True,
+                'timestamp': 1273772943.0,
              }
          },
          # Direct link to media delivered compressed (until Accept-Encoding is *)
@@ -100,6 +105,8 @@ class GenericIE(InfoExtractor):
                  'ext': 'webm',
                  'title': '5_Lennart_Poettering_-_Systemd',
                  'upload_date': '20141120',
+                'direct': True,
+                'timestamp': 1416498816.0,
              },
              'expected_warnings': [
                  'URL could be a direct video link, returning it as such.'
@@ -132,6 +139,7 @@ class GenericIE(InfoExtractor):
                      'upload_date': '20201204',
                  },
              }],
+            'skip': 'Dead link',
          },
          # RSS feed with item with description and thumbnails
          {
@@ -144,12 +152,12 @@ class GenericIE(InfoExtractor):
              'playlist': [{
                  'info_dict': {
                      'ext': 'm4a',
-                    'id': 'c1c879525ce2cb640b344507e682c36d',
+                    'id': '818a5d38-01cd-152f-2231-ee479677fa82',
                      'title': 're:Hydrogen!',
                      'description': 're:.*In this episode we are going.*',
                      'timestamp': 1567977776,
                      'upload_date': '20190908',
-                    'duration': 459,
+                    'duration': 423,
                      'thumbnail': r're:^https?://.*\.jpg$',
                      'episode_number': 1,
                      'season_number': 1,
@@ -266,6 +274,7 @@ class GenericIE(InfoExtractor):
              'params': {
                  'skip_download': True,
              },
+            'skip': '404 Not Found',
          },
          # MPD from http://dash-mse-test.appspot.com/media.html
          {
@@ -277,6 +286,7 @@ class GenericIE(InfoExtractor):
                  'title': 'car-20120827-manifest',
                  'formats': 'mincount:9',
                  'upload_date': '20130904',
+                'timestamp': 1378272859.0,
              },
          },
          # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8
@@ -317,7 +327,7 @@ class GenericIE(InfoExtractor):
                  'id': 'cmQHVoWB5FY',
                  'ext': 'mp4',
                  'upload_date': '20130224',
-                'uploader_id': 'TheVerge',
+                'uploader_id': '@TheVerge',
                  'description': r're:^Chris Ziegler takes a look at the\.*',
                  'uploader': 'The Verge',
                  'title': 'First Firefox OS phones side-by-side',
@@ -364,46 +374,6 @@ class GenericIE(InfoExtractor):
              },
              'skip': 'There is a limit of 200 free downloads / month for the test song',
          },
-        # ooyala video
-        {
-            'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
-            'md5': '166dd577b433b4d4ebfee10b0824d8ff',
-            'info_dict': {
-                'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
-                'ext': 'mp4',
-                'title': '2cc213299525360.mov',  # that's what we get
-                'duration': 238.231,
-            },
-            'add_ie': ['Ooyala'],
-        },
-        {
-            # ooyala video embedded with http://player.ooyala.com/iframe.js
-            'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/',
-            'info_dict': {
-                'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB',
-                'ext': 'mp4',
-                'title': '"Steve Jobs: Man in the Machine" trailer',
-                'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."',
-                'duration': 135.427,
-            },
-            'params': {
-                'skip_download': True,
-            },
-            'skip': 'movie expired',
-        },
-        # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js
-        {
-            'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/',
-            'info_dict': {
-                'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2',
-                'ext': 'mp4',
-                'title': 'Steampunk Fest Comes to Honesdale',
-                'duration': 43.276,
-            },
-            'params': {
-                'skip_download': True,
-            }
-        },
          # embed.ly video
          {
              'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
@@ -496,7 +466,8 @@ class GenericIE(InfoExtractor):
                  'title': 'Ужастики, русский трейлер (2015)',
                  'thumbnail': r're:^https?://.*\.jpg$',
                  'duration': 153,
-            }
+            },
+            'skip': 'Site dead',
          },
          # XHamster embed
          {
@@ -768,14 +739,16 @@ class GenericIE(InfoExtractor):
              'playlist_mincount': 1,
              'add_ie': ['Youtube'],
          },
-        # Cinchcast embed
+        # Libsyn embed
          {
              'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
              'info_dict': {
-                'id': '7141703',
+                'id': '3793998',
                  'ext': 'mp3',
                  'upload_date': '20141126',
-                'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
+                'title': 'Underground Wellness Radio - Jack Tips: 5 Steps to Permanent Gut Healing',
+                'thumbnail': 'https://assets.libsyn.com/secure/item/3793998/?height=90&width=90',
+                'duration': 3989.0,
              }
          },
          # Cinerama player
@@ -867,7 +840,7 @@ class GenericIE(InfoExtractor):
              },
          },
          {
-            # Video.js embed, multiple formats
+            # Youtube embed, formerly: Video.js embed, multiple formats
              'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html',
              'info_dict': {
                  'id': 'yygqldloqIk',
@@ -894,6 +867,7 @@ class GenericIE(InfoExtractor):
              'params': {
                  'skip_download': True,
              },
+            'skip': '404 Not Found',
          },
          # rtl.nl embed
          {
@@ -1556,16 +1530,6 @@ class GenericIE(InfoExtractor):
                  'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
              },
          },
-        {
-            # vzaar embed
-            'url': 'http://help.vzaar.com/article/165-embedding-video',
-            'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4',
-            'info_dict': {
-                'id': '8707641',
-                'ext': 'mp4',
-                'title': 'Building A Business Online: Principal Chairs Q & A',
-            },
-        },
          {
              # multiple HTML5 videos on one page
              'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html',
@@ -2169,6 +2133,33 @@ class GenericIE(InfoExtractor):
                  'age_limit': 18,
              },
          },
+        {
+            'note': 'Live HLS direct link',
+            'url': 'https://d18j67ugtrocuq.cloudfront.net/out/v1/2767aec339144787926bd0322f72c6e9/index.m3u8',
+            'info_dict': {
+                'id': 'index',
+                'title': r're:index',
+                'ext': 'mp4',
+                'live_status': 'is_live',
+            },
+            'params': {
+                'skip_download': 'm3u8',
+            },
+        },
+        {
+            'note': 'Video.js VOD HLS',
+            'url': 'https://gist.githubusercontent.com/bashonly/2aae0862c50f4a4b84f220c315767208/raw/e3380d413749dabbe804c9c2d8fd9a45142475c7/videojs_hls_test.html',
+            'info_dict': {
+                'id': 'videojs_hls_test',
+                'title': 'video',
+                'ext': 'mp4',
+                'age_limit': 0,
+                'duration': 1800,
+            },
+            'params': {
+                'skip_download': 'm3u8',
+            },
+        },
      ]
  
      def report_following_redirect(self, new_url):
@@ -2188,7 +2179,6 @@ def report_detected(self, name, num=1, note=None):
      def _extra_manifest_info(self, info, manifest_url):
          fragment_query = self._configuration_arg('fragment_query', [None], casesense=True)[0]
          if fragment_query is not None:
-            fragment_query = self._configuration_arg('fragment_query', casesense=True)[0]
              info['extra_param_to_segment_url'] = (
                  urllib.parse.urlparse(fragment_query).query or fragment_query
                  or urllib.parse.urlparse(manifest_url).query or None)
@@ -2206,6 +2196,22 @@ def _extra_manifest_info(self, info, manifest_url):
              for fmt in self._downloader._get_formats(info):
                  fmt['url'] = update_url_query(fmt['url'], query)
  
+        # Attempt to detect live HLS or set VOD duration
+        m3u8_format = next((f for f in self._downloader._get_formats(info)
+                            if determine_protocol(f) == 'm3u8_native'), None)
+        if m3u8_format:
+            is_live = self._configuration_arg('is_live', [None])[0]
+            if is_live is not None:
+                info['live_status'] = 'not_live' if is_live == 'false' else 'is_live'
+                return
+            headers = m3u8_format.get('http_headers') or info.get('http_headers')
+            duration = self._extract_m3u8_vod_duration(
+                m3u8_format['url'], info.get('id'), note='Checking m3u8 live status',
+                errnote='Failed to download m3u8 media playlist', headers=headers)
+            if not duration:
+                info['live_status'] = 'is_live'
+            info['duration'] = info.get('duration') or duration
+
      def _extract_rss(self, url, video_id, doc):
          NS_MAP = {
              'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
@@ -2326,7 +2332,7 @@ def _extract_kvs(self, url, webpage, video_id):
              'id': flashvars['video_id'],
              'display_id': display_id,
              'title': title,
-            'thumbnail': thumbnail,
+            'thumbnail': urljoin(url, thumbnail),
              'formats': formats,
          }
  
@@ -2383,11 +2389,11 @@ def _real_extract(self, url):
          # to accept raw bytes and being able to download only a chunk.
          # It may probably better to solve this by checking Content-Type for application/octet-stream
          # after a HEAD request, but not sure if we can rely on this.
-        full_response = self._request_webpage(url, video_id, headers={
+        full_response = self._request_webpage(url, video_id, headers=filter_dict({
              'Accept-Encoding': 'identity',
-            **smuggled_data.get('http_headers', {})
-        })
-        new_url = full_response.geturl()
+            'Referer': smuggled_data.get('referer'),
+        }))
+        new_url = full_response.url
          url = urllib.parse.urlparse(url)._replace(scheme=urllib.parse.urlparse(new_url).scheme).geturl()
          if new_url != extract_basic_auth(url)[0]:
              self.report_following_redirect(new_url)
@@ -2406,9 +2412,9 @@ def _real_extract(self, url):
          m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
          if m:
              self.report_detected('direct video link')
-            headers = smuggled_data.get('http_headers', {})
+            headers = filter_dict({'Referer': smuggled_data.get('referer')})
              format_id = str(m.group('format_id'))
-            ext = determine_ext(url)
+            ext = determine_ext(url, default_ext=None) or urlhandle_detect_ext(full_response)
              subtitles = {}
              if format_id.endswith('mpegurl') or ext == 'm3u8':
                  formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers)
@@ -2420,6 +2426,7 @@ def _real_extract(self, url):
                  formats = [{
                      'format_id': format_id,
                      'url': url,
+                    'ext': ext,
                      'vcodec': 'none' if m.group('type') == 'audio' else None
                  }]
                  info_dict['direct'] = True
@@ -2485,12 +2492,12 @@ def _real_extract(self, url):
                  return self.playlist_result(
                      self._parse_xspf(
                          doc, video_id, xspf_url=url,
-                        xspf_base_url=full_response.geturl()),
+                        xspf_base_url=full_response.url),
                      video_id)
              elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
                  info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles(
                      doc,
-                    mpd_base_url=full_response.geturl().rpartition('/')[0],
+                    mpd_base_url=full_response.url.rpartition('/')[0],
                      mpd_url=url)
                  self._extra_manifest_info(info_dict, url)
                  self.report_detected('DASH manifest')
@@ -2518,7 +2525,7 @@ def _real_extract(self, url):
          self._downloader.write_debug('Looking for embeds')
          embeds = list(self._extract_embeds(original_url, webpage, urlh=full_response, info_dict=info_dict))
          if len(embeds) == 1:
-            return {**info_dict, **embeds[0]}
+            return merge_dicts(embeds[0], info_dict)
          elif embeds:
              return self.playlist_result(embeds, **info_dict)
          raise UnsupportedError(url)
@@ -2528,7 +2535,7 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
          info_dict = types.MappingProxyType(info_dict)  # Prevents accidental mutation
          video_id = traverse_obj(info_dict, 'display_id', 'id') or self._generic_id(url)
          url, smuggled_data = unsmuggle_url(url, {})
-        actual_url = urlh.geturl() if urlh else url
+        actual_url = urlh.url if urlh else url
  
          # Sometimes embedded video player is hidden behind percent encoding
          # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
@@ -2581,8 +2588,7 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
              varname = mobj.group(1)
              sources = variadic(self._parse_json(
                  mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or [])
-            formats = []
-            subtitles = {}
+            formats, subtitles, src = [], {}, None
              for source in sources:
                  src = source.get('src')
                  if not src or not isinstance(src, str):
@@ -2605,8 +2611,6 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
                          m3u8_id='hls', fatal=False)
                      formats.extend(fmts)
                      self._merge_subtitles(subs, target=subtitles)
-                for fmt in formats:
-                    self._extra_manifest_info(fmt, src)
  
                  if not formats:
                      formats.append({
@@ -2622,11 +2626,11 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
              for sub_match in re.finditer(rf'(?s){re.escape(varname)}' r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage):
                  sub = self._parse_json(
                      sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {}
-                src = str_or_none(sub.get('src'))
-                if not src:
+                sub_src = str_or_none(sub.get('src'))
+                if not sub_src:
                      continue
                  subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({
-                    'url': urllib.parse.urljoin(url, src),
+                    'url': urllib.parse.urljoin(url, sub_src),
                      'name': sub.get('label'),
                      'http_headers': {
                          'Referer': actual_url,
@@ -2634,7 +2638,10 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
                  })
              if formats or subtitles:
                  self.report_detected('video.js embed')
-                return [{'formats': formats, 'subtitles': subtitles}]
+                info_dict = {'formats': formats, 'subtitles': subtitles}
+                if formats:
+                    self._extra_manifest_info(info_dict, src)
+                return [info_dict]
  
          # Look for generic KVS player (before json-ld bc of some urls that break otherwise)
          found = self._search_regex((
@@ -2657,7 +2664,7 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
                  'url': smuggle_url(json_ld['url'], {
                      'force_videoid': video_id,
                      'to_generic': True,
-                    'http_headers': {'Referer': url},
+                    'referer': url,
                  }),
              }, json_ld)]