[extractor/generic] Add extractor-args `hls_key`, `variant_query` (#6567)

[yt-dlp.git] / yt_dlp / extractor / generic.py
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py

index 2281c71f3db169f6487dd7df4945f58bf23ac611..075bb36ded8ef7f583898bc6d5348f6c78fcd0dd 100644 (file)
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -15,6 +15,7 @@
      UnsupportedError,
      determine_ext,
      dict_get,
+    extract_basic_auth,
      format_field,
      int_or_none,
      is_html,
@@ -23,6 +24,7 @@
      mimetype2ext,
      orderedSet,
      parse_duration,
+    parse_qs,
      parse_resolution,
      smuggle_url,
      str_or_none,
@@ -31,7 +33,9 @@
      unescapeHTML,
      unified_timestamp,
      unsmuggle_url,
+    update_url_query,
      url_or_none,
+    urljoin,
      variadic,
      xpath_attr,
      xpath_text,
@@ -863,20 +867,6 @@ class GenericIE(InfoExtractor):
                  'thumbnail': r're:^https?://.*\.jpg$',
              },
          },
-        {
-            # JWPlayer config passed as variable
-            'url': 'http://www.txxx.com/videos/3326530/ariele/',
-            'info_dict': {
-                'id': '3326530_hq',
-                'ext': 'mp4',
-                'title': 'ARIELE | Tube Cup',
-                'uploader': 'www.txxx.com',
-                'age_limit': 18,
-            },
-            'params': {
-                'skip_download': True,
-            }
-        },
          {
              # Video.js embed, multiple formats
              'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html',
@@ -1867,11 +1857,13 @@ class GenericIE(InfoExtractor):
                  'display_id': 'kelis-4th-of-july',
                  'ext': 'mp4',
                  'title': 'Kelis - 4th Of July',
-                'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
+                'description': 'Kelis - 4th Of July',
+                'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
              },
              'params': {
                  'skip_download': True,
              },
+            'expected_warnings': ['Untested major version'],
          }, {
              # KVS Player
              'url': 'https://www.kvs-demo.com/embed/105/',
@@ -1880,35 +1872,12 @@ class GenericIE(InfoExtractor):
                  'display_id': 'kelis-4th-of-july',
                  'ext': 'mp4',
                  'title': 'Kelis - 4th Of July / Embed Player',
-                'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
+                'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
              },
              'params': {
                  'skip_download': True,
              },
          }, {
-            # KVS Player
-            'url': 'https://thisvid.com/videos/french-boy-pantsed/',
-            'md5': '3397979512c682f6b85b3b04989df224',
-            'info_dict': {
-                'id': '2400174',
-                'display_id': 'french-boy-pantsed',
-                'ext': 'mp4',
-                'title': 'French Boy Pantsed - ThisVid.com',
-                'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
-            }
-        }, {
-            # KVS Player
-            'url': 'https://thisvid.com/embed/2400174/',
-            'md5': '3397979512c682f6b85b3b04989df224',
-            'info_dict': {
-                'id': '2400174',
-                'display_id': 'french-boy-pantsed',
-                'ext': 'mp4',
-                'title': 'French Boy Pantsed - ThisVid.com',
-                'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
-            }
-        }, {
-            # KVS Player
              'url': 'https://youix.com/video/leningrad-zoj/',
              'md5': '94f96ba95706dc3880812b27b7d8a2b8',
              'info_dict': {
@@ -1916,8 +1885,8 @@ class GenericIE(InfoExtractor):
                  'display_id': 'leningrad-zoj',
                  'ext': 'mp4',
                  'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com',
-                'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg',
-            }
+                'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg',
+            },
          }, {
              # KVS Player
              'url': 'https://youix.com/embed/18485',
@@ -1927,19 +1896,20 @@ class GenericIE(InfoExtractor):
                  'display_id': 'leningrad-zoj',
                  'ext': 'mp4',
                  'title': 'Ленинград - ЗОЖ',
-                'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg',
-            }
+                'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg',
+            },
          }, {
              # KVS Player
              'url': 'https://bogmedia.org/videos/21217/40-nochey-40-nights-2016/',
              'md5': '94166bdb26b4cb1fb9214319a629fc51',
              'info_dict': {
                  'id': '21217',
-                'display_id': '40-nochey-40-nights-2016',
+                'display_id': '40-nochey-2016',
                  'ext': 'mp4',
                  'title': '40 ночей (2016) - BogMedia.org',
+                'description': 'md5:4e6d7d622636eb7948275432eb256dc3',
                  'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg',
-            }
+            },
          },
          {
              # KVS Player (for sites that serve kt_player.js via non-https urls)
@@ -1949,9 +1919,9 @@ class GenericIE(InfoExtractor):
                  'id': '389508',
                  'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source',
                  'ext': 'mp4',
-                'title': 'Syren De Mer  onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер',
-                'thumbnail': 'http://www.camhub.world/contents/videos_screenshots/389000/389508/preview.mp4.jpg',
-            }
+                'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер',
+                'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg',
+            },
          },
          {
              # Reddit-hosted video that will redirect and be processed by RedditIE
@@ -2154,7 +2124,52 @@ class GenericIE(InfoExtractor):
                  'age_limit': 0,
                  'direct': True,
              }
-        }
+        },
+        {
+            'note': 'server returns data in brotli compression by default if `accept-encoding: *` is specified.',
+            'url': 'https://www.extra.cz/cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867',
+            'info_dict': {
+                'id': 'cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867',
+                'ext': 'mp4',
+                'title': 'čauky lidi 70 finall',
+                'description': 'čauky lidi 70 finall',
+                'thumbnail': 'h',
+                'upload_date': '20220606',
+                'timestamp': 1654513791,
+                'duration': 318.0,
+                'direct': True,
+                'age_limit': 0,
+            },
+        },
+        {
+            'note': 'JW Player embed with unicode-escape sequences in URL',
+            'url': 'https://www.medici.tv/en/concerts/lahav-shani-mozart-mahler-israel-philharmonic-abu-dhabi-classics',
+            'info_dict': {
+                'id': 'm',
+                'ext': 'mp4',
+                'title': 'Lahav Shani conducts the Israel Philharmonic\'s first-ever concert in Abu Dhabi',
+                'description': 'Mahler\'s ',
+                'uploader': 'www.medici.tv',
+                'age_limit': 0,
+                'thumbnail': r're:^https?://.+\.jpg',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/',
+            'md5': 'e2f0a4c329f7986280b7328e24036d60',
+            'info_dict': {
+                'id': '284002',
+                'display_id': 'just-out-of-the-shower-joi',
+                'ext': 'mp4',
+                'title': 'Just Out Of The Shower JOI - Shooshtime',
+                'thumbnail': 'https://i.shoosh.co/contents/videos_screenshots/284000/284002/preview.mp4.jpg',
+                'height': 720,
+                'age_limit': 18,
+            },
+        },
      ]
  
      def report_following_redirect(self, new_url):
@@ -2171,12 +2186,21 @@ def report_detected(self, name, num=1, note=None):
  
          self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}')
  
-    def _fragment_query(self, url):
+    def _extra_manifest_info(self, info, manifest_url):
          if self._configuration_arg('fragment_query'):
-            query_string = urllib.parse.urlparse(url).query
+            query_string = urllib.parse.urlparse(manifest_url).query
              if query_string:
-                return {'extra_param_to_segment_url': query_string}
-        return {}
+                info['extra_param_to_segment_url'] = query_string
+
+        hex_or_none = lambda x: x if re.fullmatch(r'(0x)?[\da-f]+', x, re.IGNORECASE) else None
+        info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key'), {
+            'uri': (0, {url_or_none}), 'key': (0, {hex_or_none}), 'iv': (1, {hex_or_none}),
+        }) or None
+
+        if self._configuration_arg('variant_query'):
+            query = parse_qs(manifest_url)
+            for fmt in self._downloader._get_formats(info):
+                fmt['url'] = update_url_query(fmt['url'], query)
  
      def _extract_rss(self, url, video_id, doc):
          NS_MAP = {
@@ -2220,43 +2244,87 @@ def itunes(key):
              'entries': entries,
          }
  
-    def _kvs_getrealurl(self, video_url, license_code):
+    @classmethod
+    def _kvs_get_real_url(cls, video_url, license_code):
          if not video_url.startswith('function/0/'):
              return video_url  # not obfuscated
  
-        url_path, _, url_query = video_url.partition('?')
-        urlparts = url_path.split('/')[2:]
-        license = self._kvs_getlicensetoken(license_code)
-        newmagic = urlparts[5][:32]
+        parsed = urllib.parse.urlparse(video_url[len('function/0/'):])
+        license = cls._kvs_get_license_token(license_code)
+        urlparts = parsed.path.split('/')
  
-        for o in range(len(newmagic) - 1, -1, -1):
-            new = ''
-            l = (o + sum(int(n) for n in license[o:])) % 32
+        HASH_LENGTH = 32
+        hash = urlparts[3][:HASH_LENGTH]
+        indices = list(range(HASH_LENGTH))
  
-            for i in range(0, len(newmagic)):
-                if i == o:
-                    new += newmagic[l]
-                elif i == l:
-                    new += newmagic[o]
-                else:
-                    new += newmagic[i]
-            newmagic = new
+        # Swap indices of hash according to the destination calculated from the license token
+        accum = 0
+        for src in reversed(range(HASH_LENGTH)):
+            accum += license[src]
+            dest = (src + accum) % HASH_LENGTH
+            indices[src], indices[dest] = indices[dest], indices[src]
  
-        urlparts[5] = newmagic + urlparts[5][32:]
-        return '/'.join(urlparts) + '?' + url_query
+        urlparts[3] = ''.join(hash[index] for index in indices) + urlparts[3][HASH_LENGTH:]
+        return urllib.parse.urlunparse(parsed._replace(path='/'.join(urlparts)))
  
-    def _kvs_getlicensetoken(self, license):
-        modlicense = license.replace('$', '').replace('0', '1')
-        center = int(len(modlicense) / 2)
+    @staticmethod
+    def _kvs_get_license_token(license):
+        license = license.replace('$', '')
+        license_values = [int(char) for char in license]
+
+        modlicense = license.replace('0', '1')
+        center = len(modlicense) // 2
          fronthalf = int(modlicense[:center + 1])
          backhalf = int(modlicense[center:])
+        modlicense = str(4 * abs(fronthalf - backhalf))[:center + 1]
+
+        return [
+            (license_values[index + offset] + current) % 10
+            for index, current in enumerate(map(int, modlicense))
+            for offset in range(4)
+        ]
+
+    def _extract_kvs(self, url, webpage, video_id):
+        flashvars = self._search_json(
+            r'(?s:<script\b[^>]*>.*?var\s+flashvars\s*=)',
+            webpage, 'flashvars', video_id, transform_source=js_to_json)
+
+        # extract the part after the last / as the display_id from the
+        # canonical URL.
+        display_id = self._search_regex(
+            r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>'
+            r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',
+            webpage, 'display_id', fatal=False)
+        title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')
+
+        thumbnail = flashvars['preview_url']
+        if thumbnail.startswith('//'):
+            protocol, _, _ = url.partition('/')
+            thumbnail = protocol + thumbnail
+
+        url_keys = list(filter(re.compile(r'^video_(?:url|alt_url\d*)$').match, flashvars.keys()))
+        formats = []
+        for key in url_keys:
+            if '/get_file/' not in flashvars[key]:
+                continue
+            format_id = flashvars.get(f'{key}_text', key)
+            formats.append({
+                'url': urljoin(url, self._kvs_get_real_url(flashvars[key], flashvars['license_code'])),
+                'format_id': format_id,
+                'ext': 'mp4',
+                **(parse_resolution(format_id) or parse_resolution(flashvars[key])),
+                'http_headers': {'Referer': url},
+            })
+            if not formats[-1].get('height'):
+                formats[-1]['quality'] = 1
  
-        modlicense = str(4 * abs(fronthalf - backhalf))
-        retval = ''
-        for o in range(0, center + 1):
-            for i in range(1, 5):
-                retval += str((int(license[o + i]) + int(modlicense[o])) % 10)
-        return retval
+        return {
+            'id': flashvars['video_id'],
+            'display_id': display_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
  
      def _real_extract(self, url):
          if url.startswith('//'):
@@ -2312,13 +2380,12 @@ def _real_extract(self, url):
          # It may probably better to solve this by checking Content-Type for application/octet-stream
          # after a HEAD request, but not sure if we can rely on this.
          full_response = self._request_webpage(url, video_id, headers={
-            'Accept-Encoding': '*',
+            'Accept-Encoding': 'identity',
              **smuggled_data.get('http_headers', {})
          })
          new_url = full_response.geturl()
-        if new_url == urllib.parse.urlparse(url)._replace(scheme='https').geturl():
-            url = new_url
-        elif url != new_url:
+        url = urllib.parse.urlparse(url)._replace(scheme=urllib.parse.urlparse(new_url).scheme).geturl()
+        if new_url != extract_basic_auth(url)[0]:
              self.report_following_redirect(new_url)
              if force_videoid:
                  new_url = smuggle_url(new_url, {'force_videoid': force_videoid})
@@ -2337,14 +2404,13 @@ def _real_extract(self, url):
              self.report_detected('direct video link')
              headers = smuggled_data.get('http_headers', {})
              format_id = str(m.group('format_id'))
+            ext = determine_ext(url)
              subtitles = {}
-            if format_id.endswith('mpegurl'):
+            if format_id.endswith('mpegurl') or ext == 'm3u8':
                  formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers)
-                info_dict.update(self._fragment_query(url))
-            elif format_id.endswith('mpd') or format_id.endswith('dash+xml'):
+            elif format_id.endswith('mpd') or format_id.endswith('dash+xml') or ext == 'mpd':
                  formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id, headers=headers)
-                info_dict.update(self._fragment_query(url))
-            elif format_id == 'f4m':
+            elif format_id == 'f4m' or ext == 'f4m':
                  formats = self._extract_f4m_formats(url, video_id, headers=headers)
              else:
                  formats = [{
@@ -2358,6 +2424,7 @@ def _real_extract(self, url):
                  'subtitles': subtitles,
                  'http_headers': headers or None,
              })
+            self._extra_manifest_info(info_dict, url)
              return info_dict
  
          if not self.get_param('test', False) and not is_intentional:
@@ -2370,7 +2437,7 @@ def _real_extract(self, url):
          if first_bytes.startswith(b'#EXTM3U'):
              self.report_detected('M3U playlist')
              info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
-            info_dict.update(self._fragment_query(url))
+            self._extra_manifest_info(info_dict, url)
              return info_dict
  
          # Maybe it's a direct link to a video?
@@ -2421,7 +2488,7 @@ def _real_extract(self, url):
                      doc,
                      mpd_base_url=full_response.geturl().rpartition('/')[0],
                      mpd_url=url)
-                info_dict.update(self._fragment_query(url))
+                self._extra_manifest_info(info_dict, url)
                  self.report_detected('DASH manifest')
                  return info_dict
              elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
@@ -2535,7 +2602,7 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
                      formats.extend(fmts)
                      self._merge_subtitles(subs, target=subtitles)
                  for fmt in formats:
-                    fmt.update(self._fragment_query(src))
+                    self._extra_manifest_info(fmt, src)
  
                  if not formats:
                      formats.append({
@@ -2565,6 +2632,17 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
                  self.report_detected('video.js embed')
                  return [{'formats': formats, 'subtitles': subtitles}]
  
+        # Look for generic KVS player (before json-ld bc of some urls that break otherwise)
+        found = self._search_regex((
+            r'<script\b[^>]+?\bsrc\s*=\s*(["\'])https?://(?:(?!\1)[^?#])+/kt_player\.js\?v=(?P<ver>\d+(?:\.\d+)+)\1[^>]*>',
+            r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:(?!\2)[^?#])+/kt_player\.swf\?v=(?P<ver>\d+(?:\.\d+)+)\2\s*,',
+        ), webpage, 'KVS player', group='ver', default=False)
+        if found:
+            self.report_detected('KVS Player')
+            if found.split('.')[0] not in ('4', '5', '6'):
+                self.report_warning(f'Untested major version ({found}) in player engine - download may fail.')
+            return [self._extract_kvs(url, webpage, video_id)]
+
          # Looking for http://schema.org/VideoObject
          json_ld = self._search_json_ld(webpage, video_id, default={})
          if json_ld.get('url') not in (url, None):
@@ -2607,52 +2685,6 @@ def filter_video(urls):
                  ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
              if found:
                  self.report_detected('JW Player embed')
-        if not found:
-            # Look for generic KVS player
-            found = re.search(r'<script [^>]*?src="https?://.+?/kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)".*?>', webpage)
-            if found:
-                self.report_detected('KWS Player')
-                if found.group('maj_ver') not in ['4', '5']:
-                    self.report_warning('Untested major version (%s) in player engine--Download may fail.' % found.group('ver'))
-                flashvars = re.search(r'(?ms)<script.*?>.*?var\s+flashvars\s*=\s*(\{.*?\});.*?</script>', webpage)
-                flashvars = self._parse_json(flashvars.group(1), video_id, transform_source=js_to_json)
-
-                # extract the part after the last / as the display_id from the
-                # canonical URL.
-                display_id = self._search_regex(
-                    r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>'
-                    r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',
-                    webpage, 'display_id', fatal=False
-                )
-                title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')
-
-                thumbnail = flashvars['preview_url']
-                if thumbnail.startswith('//'):
-                    protocol, _, _ = url.partition('/')
-                    thumbnail = protocol + thumbnail
-
-                url_keys = list(filter(re.compile(r'video_url|video_alt_url\d*').fullmatch, flashvars.keys()))
-                formats = []
-                for key in url_keys:
-                    if '/get_file/' not in flashvars[key]:
-                        continue
-                    format_id = flashvars.get(f'{key}_text', key)
-                    formats.append({
-                        'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']),
-                        'format_id': format_id,
-                        'ext': 'mp4',
-                        **(parse_resolution(format_id) or parse_resolution(flashvars[key]))
-                    })
-                    if not formats[-1].get('height'):
-                        formats[-1]['quality'] = 1
-
-                return [{
-                    'id': flashvars['video_id'],
-                    'display_id': display_id,
-                    'title': title,
-                    'thumbnail': thumbnail,
-                    'formats': formats,
-                }]
          if not found:
              # Broaden the search a little bit
              found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
@@ -2733,6 +2765,7 @@ def filter_video(urls):
  
          entries = []
          for video_url in orderedSet(found):
+            video_url = video_url.encode().decode('unicode-escape')
              video_url = unescapeHTML(video_url)
              video_url = video_url.replace('\\/', '/')
              video_url = urllib.parse.urljoin(url, video_url)
@@ -2772,10 +2805,10 @@ def filter_video(urls):
                  return [self._extract_xspf_playlist(video_url, video_id)]
              elif ext == 'm3u8':
                  entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers)
-                entry_info_dict.update(self._fragment_query(video_url))
+                self._extra_manifest_info(entry_info_dict, video_url)
              elif ext == 'mpd':
                  entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers)
-                entry_info_dict.update(self._fragment_query(video_url))
+                self._extra_manifest_info(entry_info_dict, video_url)
              elif ext == 'f4m':
                  entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers)
              elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: