[extractor/generic] Add `fragment_query` extractor arg for DASH and HLS (#5528)

author bashonly <redacted>

Mon, 21 Nov 2022 00:51:45 +0000 (00:51 +0000)

committer GitHub <redacted>

Mon, 21 Nov 2022 00:51:45 +0000 (00:51 +0000)
author bashonly <redacted>
Mon, 21 Nov 2022 00:51:45 +0000 (00:51 +0000)
committer GitHub <redacted>
Mon, 21 Nov 2022 00:51:45 +0000 (00:51 +0000)
diff --git a/README.md b/README.md

index f336dcb6acbc7dde68e0a05b67541d1f1f3fbcb6..fa55d130bb632ccd81bb2f39d5b8be170d5f73d4 100644 (file)
--- a/README.md
+++ b/README.md
@@ -1736,6 +1736,9 @@ #### youtubetab (YouTube playlists, channels, feeds, etc.)
  * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details)
  * `approximate_date`: Extract approximate `upload_date` and `timestamp` in flat-playlist. This may cause date-based filters to be slightly off
  
+#### generic
+* `fragment_query`: Pass through any query string present in mpd/m3u8 manifest URLs to their fragment URLs. Does not apply to ffmpeg
+
  #### funimation
  * `language`: Audio languages to extract, e.g. `funimation:language=english,japanese`
  * `version`: The video version to extract - `uncut` or `simulcast`
diff --git a/yt_dlp/downloader/dash.py b/yt_dlp/downloader/dash.py

index 8723e1068965a9ffe61f1546372a0932addb6c6e..4328d739c2696ed8a97784e9417bfcbeaa3e5e76 100644 (file)
--- a/yt_dlp/downloader/dash.py
+++ b/yt_dlp/downloader/dash.py
@@ -1,8 +1,9 @@
  import time
+import urllib.parse
  
  from . import get_suitable_downloader
  from .fragment import FragmentFD
-from ..utils import urljoin
+from ..utils import update_url_query, urljoin
  
  
  class DashSegmentsFD(FragmentFD):
@@ -40,7 +41,12 @@ def real_download(self, filename, info_dict):
                  self._prepare_and_start_frag_download(ctx, fmt)
              ctx['start'] = real_start
  
-            fragments_to_download = self._get_fragments(fmt, ctx)
+            extra_query = None
+            extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url')
+            if extra_param_to_segment_url:
+                extra_query = urllib.parse.parse_qs(extra_param_to_segment_url)
+
+            fragments_to_download = self._get_fragments(fmt, ctx, extra_query)
  
              if real_downloader:
                  self.to_screen(
@@ -57,7 +63,7 @@ def _resolve_fragments(self, fragments, ctx):
          fragments = fragments(ctx) if callable(fragments) else fragments
          return [next(iter(fragments))] if self.params.get('test') else fragments
  
-    def _get_fragments(self, fmt, ctx):
+    def _get_fragments(self, fmt, ctx, extra_query):
          fragment_base_url = fmt.get('fragment_base_url')
          fragments = self._resolve_fragments(fmt['fragments'], ctx)
  
@@ -70,6 +76,8 @@ def _get_fragments(self, fmt, ctx):
              if not fragment_url:
                  assert fragment_base_url
                  fragment_url = urljoin(fragment_base_url, fragment['path'])
+            if extra_query:
+                fragment_url = update_url_query(fragment_url, extra_query)
  
              yield {
                  'frag_index': frag_index,
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py

index 5da77273d804886f5ad3f73979112b895f6628ef..2fcbc6f43f830c68f51547b95d7de9952f89b43a 100644 (file)
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -2189,6 +2189,13 @@ def report_detected(self, name, num=1, note=None):
  
          self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}')
  
+    def _fragment_query(self, url):
+        if self._configuration_arg('fragment_query'):
+            query_string = urllib.parse.urlparse(url).query
+            if query_string:
+                return {'extra_param_to_segment_url': query_string}
+        return {}
+
      def _extract_rss(self, url, video_id, doc):
          NS_MAP = {
              'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
@@ -2351,8 +2358,10 @@ def _real_extract(self, url):
              subtitles = {}
              if format_id.endswith('mpegurl'):
                  formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers)
+                info_dict.update(self._fragment_query(url))
              elif format_id.endswith('mpd') or format_id.endswith('dash+xml'):
                  formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id, headers=headers)
+                info_dict.update(self._fragment_query(url))
              elif format_id == 'f4m':
                  formats = self._extract_f4m_formats(url, video_id, headers=headers)
              else:
@@ -2379,6 +2388,7 @@ def _real_extract(self, url):
          if first_bytes.startswith(b'#EXTM3U'):
              self.report_detected('M3U playlist')
              info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
+            info_dict.update(self._fragment_query(url))
              return info_dict
  
          # Maybe it's a direct link to a video?
@@ -2429,6 +2439,7 @@ def _real_extract(self, url):
                      doc,
                      mpd_base_url=full_response.geturl().rpartition('/')[0],
                      mpd_url=url)
+                info_dict.update(self._fragment_query(url))
                  self.report_detected('DASH manifest')
                  return info_dict
              elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
@@ -2541,7 +2552,10 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
                          m3u8_id='hls', fatal=False)
                      formats.extend(fmts)
                      self._merge_subtitles(subs, target=subtitles)
-                else:
+                for fmt in formats:
+                    fmt.update(self._fragment_query(src))
+
+                if not formats:
                      formats.append({
                          'url': src,
                          'ext': (mimetype2ext(src_type)
@@ -2776,8 +2790,10 @@ def filter_video(urls):
                  return [self._extract_xspf_playlist(video_url, video_id)]
              elif ext == 'm3u8':
                  entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers)
+                entry_info_dict.update(self._fragment_query(video_url))
              elif ext == 'mpd':
                  entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers)
+                entry_info_dict.update(self._fragment_query(video_url))
              elif ext == 'f4m':
                  entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers)
              elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url:
author	bashonly <redacted>
	Mon, 21 Nov 2022 00:51:45 +0000 (00:51 +0000)
committer	GitHub <redacted>
	Mon, 21 Nov 2022 00:51:45 +0000 (00:51 +0000)
README.md		patch \| blob \| blame \| history
yt_dlp/downloader/dash.py		patch \| blob \| blame \| history
yt_dlp/extractor/generic.py		patch \| blob \| blame \| history