[core] Allow extractors to mark formats as potentially DRM (#7396)
[yt-dlp.git] / yt_dlp / extractor / common.py
index 838899052c762b29702a398e64cb61862b23349c..fe08839aaa6c484e91c4b728c040685ebc824adf 100644 (file)
 import os
 import random
 import re
+import subprocess
 import sys
 import time
 import types
+import urllib.error
 import urllib.parse
 import urllib.request
 import xml.etree.ElementTree
@@ -24,6 +26,7 @@
 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
 from ..cookies import LenientSimpleCookie
 from ..downloader.f4m import get_base_url, remove_encrypted_media
+from ..downloader.hls import HlsFD
 from ..utils import (
     IDENTITY,
     JSON_LD_RE,
@@ -34,6 +37,7 @@
     GeoUtils,
     HEADRequest,
     LenientJSONDecoder,
+    Popen,
     RegexNotFoundError,
     RetryManager,
     UnsupportedError,
@@ -56,6 +60,7 @@
     join_nonempty,
     js_to_json,
     mimetype2ext,
+    netrc_from_content,
     network_exceptions,
     orderedSet,
     parse_bitrate,
@@ -220,7 +225,8 @@ class InfoExtractor:
                                  width : height ratio as float.
                     * no_resume  The server does not support resuming the
                                  (HTTP or RTMP) download. Boolean.
-                    * has_drm    The format has DRM and cannot be downloaded. Boolean
+                    * has_drm    True if the format has DRM and cannot be downloaded.
+                                 'maybe' if the format may have DRM and has to be tested before download.
                     * extra_param_to_segment_url  A query string to append to each
                                  fragment's URL, or to update each existing query string
                                  with. Only applied by the native HLS/DASH downloaders.
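
# Illustrative sketch (not part of the diff): an extractor that cannot tell
# up front whether a stream is encrypted can now mark the format for a
# download-time probe instead of dropping it. Key names come from the
# docstring above; all values are invented.
formats = [{
    'format_id': 'hls-1080',
    'url': 'https://example.com/master.m3u8',
    'ext': 'mp4',
    'has_drm': 'maybe',  # probed before download rather than skipped outright
}]
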
@@ -286,6 +292,7 @@ class InfoExtractor:
     channel_id:     Id of the channel.
     channel_url:    Full URL to a channel webpage.
     channel_follower_count: Number of followers of the channel.
+    channel_is_verified: Whether the channel is verified on the platform.
     location:       Physical location where the video was filmed.
     subtitles:      The available subtitles as a dictionary in the format
                     {tag: subformats}. "tag" is usually a language code, and
@@ -314,6 +321,11 @@ class InfoExtractor:
                         * "author" - human-readable name of the comment author
                         * "author_id" - user ID of the comment author
                         * "author_thumbnail" - The thumbnail of the comment author
+                        * "author_url" - The URL to the comment author's page
+                        * "author_is_verified" - Whether the author is verified
+                                                 on the platform
+                        * "author_is_uploader" - Whether the comment is made by
+                                                 the video uploader
                         * "id" - Comment ID
                         * "html" - Comment as HTML
                         * "text" - Plain text of the comment
@@ -325,8 +337,8 @@ class InfoExtractor:
                         * "dislike_count" - Number of negative ratings of the comment
                         * "is_favorited" - Whether the comment is marked as
                                            favorite by the video uploader
-                        * "author_is_uploader" - Whether the comment is made by
-                                                 the video uploader
+                        * "is_pinned" - Whether the comment is pinned to
+                                        the top of the comments
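
# Illustrative sketch (not part of the diff): a comment entry combining the
# keys documented above, including the newly added author_url,
# author_is_verified, author_is_uploader and is_pinned. Values are invented.
comment = {
    'id': '12345',
    'text': 'First!',
    'author': 'Jane Doe',
    'author_id': 'user123',
    'author_url': 'https://example.com/user/janedoe',
    'author_is_verified': True,
    'author_is_uploader': False,
    'like_count': 42,
    'is_pinned': True,
}
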
     age_limit:      Age restriction for the video, as an integer (years)
     webpage_url:    The URL to the video webpage, if given to yt-dlp it
                     should allow to get the same result again. (It will be set
@@ -350,6 +362,10 @@ class InfoExtractor:
                         * "start_time" - The start time of the chapter in seconds
                         * "end_time" - The end time of the chapter in seconds
                         * "title" (optional, string)
+    heatmap:        A list of dictionaries, with the following entries:
+                        * "start_time" - The start time of the data point in seconds
+                        * "end_time" - The end time of the data point in seconds
+                        * "value" - The normalized value of the data point (float between 0 and 1)
     playable_in_embed: Whether this video is allowed to play in embedded
                     players on other sites. Can be True (=always allowed),
                     False (=never allowed), None (=unknown), or a string
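
# Illustrative sketch (not part of the diff): a heatmap for a 30-second video
# using the three keys documented above. Values are invented.
heatmap = [
    {'start_time': 0.0, 'end_time': 10.0, 'value': 0.2},
    {'start_time': 10.0, 'end_time': 20.0, 'value': 1.0},
    {'start_time': 20.0, 'end_time': 30.0, 'value': 0.55},
]
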
@@ -461,8 +477,8 @@ class InfoExtractor:
 
 
     Subclasses of this should also be added to the list of extractors and
-    should define a _VALID_URL regexp and, re-define the _real_extract() and
-    (optionally) _real_initialize() methods.
+    should define _VALID_URL as a regexp or a Sequence of regexps, and
+    re-define the _real_extract() and (optionally) _real_initialize() methods.
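
# Illustrative sketch (not part of the diff): a hypothetical extractor using
# the newly allowed Sequence form of _VALID_URL.
from yt_dlp.extractor.common import InfoExtractor

class ExampleIE(InfoExtractor):
    _VALID_URL = (
        r'https?://(?:www\.)?example\.com/watch/(?P<id>\d+)',
        r'https?://exmpl\.example/(?P<id>\d+)',
    )
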
 
     Subclasses may also override suitable() if necessary, but ensure the function
     signature is preserved and that this function imports everything it needs
@@ -525,7 +541,7 @@ class InfoExtractor:
     _EMBED_REGEX = []
 
     def _login_hint(self, method=NO_DEFAULT, netrc=None):
-        password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
+        password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
         return {
             None: '',
             'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
@@ -552,8 +568,8 @@ def _match_valid_url(cls, url):
         # we have cached the regexp for *this* class, whereas getattr would also
         # match the superclass
         if '_VALID_URL_RE' not in cls.__dict__:
-            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
-        return cls._VALID_URL_RE.match(url)
+            cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
+        return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)
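
# Continuing the hypothetical ExampleIE sketched earlier: every pattern is
# compiled once per class, and the first one that matches wins.
m = ExampleIE._match_valid_url('https://exmpl.example/42')
assert m is not None and m.group('id') == '42'
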
 
     @classmethod
     def suitable(cls, url):
@@ -1281,45 +1297,48 @@ def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=Tr
         return clean_html(res)
 
     def _get_netrc_login_info(self, netrc_machine=None):
-        username = None
-        password = None
         netrc_machine = netrc_machine or self._NETRC_MACHINE
 
-        if self.get_param('usenetrc', False):
-            try:
-                netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
-                if os.path.isdir(netrc_file):
-                    netrc_file = os.path.join(netrc_file, '.netrc')
-                info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
-                if info is not None:
-                    username = info[0]
-                    password = info[2]
-                else:
-                    raise netrc.NetrcParseError(
-                        'No authenticators for %s' % netrc_machine)
-            except (OSError, netrc.NetrcParseError) as err:
-                self.report_warning(
-                    'parsing .netrc: %s' % error_to_compat_str(err))
+        cmd = self.get_param('netrc_cmd')
+        if cmd:
+            cmd = cmd.replace('{}', netrc_machine)
+            self.to_screen(f'Executing command: {cmd}')
+            stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
+            if ret != 0:
+                raise OSError(f'Command returned error code {ret}')
+            info = netrc_from_content(stdout).authenticators(netrc_machine)
+
+        elif self.get_param('usenetrc', False):
+            netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
+            if os.path.isdir(netrc_file):
+                netrc_file = os.path.join(netrc_file, '.netrc')
+            info = netrc.netrc(netrc_file).authenticators(netrc_machine)
 
-        return username, password
+        else:
+            return None, None
+        if not info:
+            raise netrc.NetrcParseError(f'No authenticators for {netrc_machine}')
+        return info[0], info[2]
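
# Illustrative sketch (not part of the diff) of the new netrc_cmd branch.
# The command string is a made-up example; '{}' is replaced with the netrc
# machine name and stdout must be netrc-formatted, e.g.
#   machine example login myuser password mypass
import subprocess
from yt_dlp.utils import Popen, netrc_from_content

cmd = 'gpg --quiet --decrypt ~/.netrc.gpg'.replace('{}', 'example')
stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
if ret == 0:
    # authenticators() returns (login, account, password) or None
    info = netrc_from_content(stdout).authenticators('example')
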
 
     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
         """
         Get the login info as (username, password)
         First look for the manually specified credentials using username_option
         and password_option as keys in params dictionary. If no such credentials
-        available look in the netrc file using the netrc_machine or _NETRC_MACHINE
-        value.
+        are available, try the netrc_cmd if it is defined, or look in the
+        netrc file using the netrc_machine or _NETRC_MACHINE value.
         If there's no info available, return (None, None)
         """
 
-        # Attempt to use provided username and password or .netrc data
         username = self.get_param(username_option)
         if username is not None:
             password = self.get_param(password_option)
         else:
-            username, password = self._get_netrc_login_info(netrc_machine)
-
+            try:
+                username, password = self._get_netrc_login_info(netrc_machine)
+            except (OSError, netrc.NetrcParseError) as err:
+                self.report_warning(f'Failed to parse .netrc: {err}')
+                return None, None
         return username, password
 
     def _get_tfa_info(self, note='two-factor verification code'):
@@ -1962,11 +1981,7 @@ def _parse_m3u8_formats_and_subtitles(
             errnote=None, fatal=True, data=None, headers={}, query={},
             video_id=None):
         formats, subtitles = [], {}
-
-        has_drm = re.search('|'.join([
-            r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
-            r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
-        ]), m3u8_doc)
+        has_drm = HlsFD._has_drm(m3u8_doc)
 
         def format_url(url):
             return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
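
# Illustrative sketch (not part of the diff), assuming HlsFD._has_drm still
# recognizes the two markers named by the removed inline regex (Adobe Flash
# Access and Apple FairPlay). The playlist content is invented.
from yt_dlp.downloader.hls import HlsFD

m3u8_doc = (
    '#EXTM3U\n'
    '#EXT-X-SESSION-KEY:METHOD=SAMPLE-AES,URI="skd://example-key-id"\n'
    '#EXT-X-STREAM-INF:BANDWIDTH=1280000\n'
    'stream.m3u8\n'
)
assert HlsFD._has_drm(m3u8_doc)
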
@@ -3440,7 +3455,7 @@ def _set_cookie(self, domain, name, value, expire_time=None, port=None,
 
     def _get_cookies(self, url):
         """ Return a http.cookies.SimpleCookie with the cookies for the url """
-        return LenientSimpleCookie(self._downloader._calc_cookies(url))
+        return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
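
# Illustrative sketch (not part of the diff): typical usage inside an
# extractor method (hypothetical URL and cookie name). The SimpleCookie is
# now built from the cookiejar's Cookie header rather than the removed
# _calc_cookies helper.
cookies = self._get_cookies('https://example.com/')
morsel = cookies.get('session_id')  # http.cookies.Morsel or None
session_id = morsel.value if morsel else None
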
 
     def _apply_first_set_cookie_header(self, url_handle, cookie):
         """
@@ -3658,18 +3673,22 @@ def _extract_chapters_helper(self, chapter_list, start_function, title_function,
             'start_time': start_function(chapter),
             'title': title_function(chapter),
         } for chapter in chapter_list or []]
-        if not strict:
+        if strict:
+            warn = self.report_warning
+        else:
+            warn = self.write_debug
             chapter_list.sort(key=lambda c: c['start_time'] or 0)
 
         chapters = [{'start_time': 0}]
         for idx, chapter in enumerate(chapter_list):
             if chapter['start_time'] is None:
-                self.report_warning(f'Incomplete chapter {idx}')
+                warn(f'Incomplete chapter {idx}')
             elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
                 chapters.append(chapter)
             elif chapter not in chapters:
-                self.report_warning(
-                    f'Invalid start time ({chapter["start_time"]} < {chapters[-1]["start_time"]}) for chapter "{chapter["title"]}"')
+                issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
+                         else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
+                warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
         return chapters[1:]
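
# Illustrative sketch (not part of the diff), with invented chapter data and
# assuming the signature cut off in the hunk header above continues with
# `duration, strict=True`. In non-strict mode the list is sorted first and
# problems are logged via write_debug instead of report_warning:
chapters = self._extract_chapters_helper(
    [{'t': 90, 'name': 'Past the end'}, {'t': 0, 'name': 'Intro'}],
    start_function=lambda c: c['t'],
    title_function=lambda c: c['name'],
    duration=60, strict=False)
# -> [{'start_time': 0, 'title': 'Intro'}]; the 90-second chapter triggers
#    'Invalid start time (90 > 60)' at debug level and is dropped
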
 
     def _extract_chapters_from_description(self, description, duration):