[extractor/generic] Attempt to detect live HLS (#6775)

author bashonly <redacted>

Thu, 13 Apr 2023 19:36:06 +0000 (14:36 -0500)

committer GitHub <redacted>

Thu, 13 Apr 2023 19:36:06 +0000 (19:36 +0000)
author bashonly <redacted>
Thu, 13 Apr 2023 19:36:06 +0000 (14:36 -0500)
committer GitHub <redacted>
Thu, 13 Apr 2023 19:36:06 +0000 (19:36 +0000)
diff --git a/README.md b/README.md

index 3e8484314f9c13555d5e2ba4832b490590e379b4..35229f728ee309a3d8c09e99f022d95f9ef4e568 100644 (file)
--- a/README.md
+++ b/README.md
@@ -1800,6 +1800,7 @@ #### generic
  * `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments if no value is provided, or else apply the query string given as `fragment_query=VALUE`. Does not apply to ffmpeg
  * `variant_query`: Passthrough the master m3u8 URL query to its variant playlist URLs if no value is provided, or else apply the query string given as `variant_query=VALUE`
  * `hls_key`: An HLS AES-128 key URI *or* key (as hex), and optionally the IV (as hex), in the form of `(URI|KEY)[,IV]`; e.g. `generic:hls_key=ABCDEF1234567980,0xFEDCBA0987654321`. Passing any of these values will force usage of the native HLS downloader and override the corresponding values found in the m3u8 playlist
+* `is_live`: Bypass live HLS detection and manually set `live_status`. A value of `false` sets `not_live`; any other value (or no value) sets `is_live`
  
  #### funimation
  * `language`: Audio languages to extract, e.g. `funimation:language=english,japanese`
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py

index 75355aeb5bf40cc068da1dad76431af5cd42d401..87cf11d6bdef84c163a63b390bca86d810b066df 100644 (file)
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -14,6 +14,7 @@
      ExtractorError,
      UnsupportedError,
      determine_ext,
+    determine_protocol,
      dict_get,
      extract_basic_auth,
      format_field,
@@ -867,7 +868,7 @@ class GenericIE(InfoExtractor):
              },
          },
          {
-            # Video.js embed, multiple formats
+            # YouTube embed, formerly: Video.js embed, multiple formats
              'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html',
              'info_dict': {
                  'id': 'yygqldloqIk',
@@ -894,6 +895,7 @@ class GenericIE(InfoExtractor):
              'params': {
                  'skip_download': True,
              },
+            'skip': '404 Not Found',
          },
          # rtl.nl embed
          {
@@ -2169,6 +2171,33 @@ class GenericIE(InfoExtractor):
                  'age_limit': 18,
              },
          },
+        {
+            'note': 'Live HLS direct link',
+            'url': 'https://d18j67ugtrocuq.cloudfront.net/out/v1/2767aec339144787926bd0322f72c6e9/index.m3u8',
+            'info_dict': {
+                'id': 'index',
+                'title': r're:index',
+                'ext': 'mp4',
+                'live_status': 'is_live',
+            },
+            'params': {
+                'skip_download': 'm3u8',
+            },
+        },
+        {
+            'note': 'Video.js VOD HLS',
+            'url': 'https://gist.githubusercontent.com/bashonly/2aae0862c50f4a4b84f220c315767208/raw/e3380d413749dabbe804c9c2d8fd9a45142475c7/videojs_hls_test.html',
+            'info_dict': {
+                'id': 'videojs_hls_test',
+                'title': 'video',
+                'ext': 'mp4',
+                'age_limit': 0,
+                'duration': 1800,
+            },
+            'params': {
+                'skip_download': 'm3u8',
+            },
+        },
      ]
  
      def report_following_redirect(self, new_url):
@@ -2205,6 +2234,22 @@ def _extra_manifest_info(self, info, manifest_url):
              for fmt in self._downloader._get_formats(info):
                  fmt['url'] = update_url_query(fmt['url'], query)
  
+        # Attempt to detect live HLS or set VOD duration
+        m3u8_format = next((f for f in self._downloader._get_formats(info)
+                            if determine_protocol(f) == 'm3u8_native'), None)
+        if m3u8_format:
+            is_live = self._configuration_arg('is_live', [None])[0]
+            if is_live is not None:
+                info['live_status'] = 'not_live' if is_live == 'false' else 'is_live'
+                return
+            headers = m3u8_format.get('http_headers') or info.get('http_headers')
+            duration = self._extract_m3u8_vod_duration(
+                m3u8_format['url'], info.get('id'), note='Checking m3u8 live status',
+                errnote='Failed to download m3u8 media playlist', headers=headers)
+            if not duration:
+                info['live_status'] = 'is_live'
+            info['duration'] = info.get('duration') or duration
+
      def _extract_rss(self, url, video_id, doc):
          NS_MAP = {
              'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
@@ -2580,8 +2625,7 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
              varname = mobj.group(1)
              sources = variadic(self._parse_json(
                  mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or [])
-            formats = []
-            subtitles = {}
+            formats, subtitles, src = [], {}, None
              for source in sources:
                  src = source.get('src')
                  if not src or not isinstance(src, str):
@@ -2604,8 +2648,6 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
                          m3u8_id='hls', fatal=False)
                      formats.extend(fmts)
                      self._merge_subtitles(subs, target=subtitles)
-                for fmt in formats:
-                    self._extra_manifest_info(fmt, src)
  
                  if not formats:
                      formats.append({
@@ -2621,11 +2663,11 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
              for sub_match in re.finditer(rf'(?s){re.escape(varname)}' r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage):
                  sub = self._parse_json(
                      sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {}
-                src = str_or_none(sub.get('src'))
-                if not src:
+                sub_src = str_or_none(sub.get('src'))
+                if not sub_src:
                      continue
                  subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({
-                    'url': urllib.parse.urljoin(url, src),
+                    'url': urllib.parse.urljoin(url, sub_src),
                      'name': sub.get('label'),
                      'http_headers': {
                          'Referer': actual_url,
@@ -2633,7 +2675,10 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
                  })
              if formats or subtitles:
                  self.report_detected('video.js embed')
-                return [{'formats': formats, 'subtitles': subtitles}]
+                info_dict = {'formats': formats, 'subtitles': subtitles}
+                if formats:
+                    self._extra_manifest_info(info_dict, src)
+                return [info_dict]
  
          # Look for generic KVS player (before json-ld bc of some urls that break otherwise)
          found = self._search_regex((
author	bashonly <redacted>
	Thu, 13 Apr 2023 19:36:06 +0000 (14:36 -0500)
committer	GitHub <redacted>
	Thu, 13 Apr 2023 19:36:06 +0000 (19:36 +0000)
README.md		patch \| blob \| blame \| history
yt_dlp/extractor/generic.py		patch \| blob \| blame \| history