Fix bug in db3ad8a67661d7b234a6954d9c6a4a9b1749f5eb

[yt-dlp.git] / yt_dlp / extractor / common.py
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py

index ebacc87bc00473136ef0ac4ca1432f12094af54c..2ea36c63da6b3cb2d938893504f71afcebfea854 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -13,9 +13,11 @@
  import os
  import random
  import re
+import subprocess
  import sys
  import time
  import types
+import urllib.error
  import urllib.parse
  import urllib.request
  import xml.etree.ElementTree
@@ -34,6 +36,7 @@
      GeoUtils,
      HEADRequest,
      LenientJSONDecoder,
+    Popen,
      RegexNotFoundError,
      RetryManager,
      UnsupportedError,
@@ -56,6 +59,7 @@
      join_nonempty,
      js_to_json,
      mimetype2ext,
+    netrc_from_content,
      network_exceptions,
      orderedSet,
      parse_bitrate,
@@ -132,6 +136,7 @@ class InfoExtractor:
                                         is parsed from a string (in case of
                                         fragmented media)
                                     for MSS - URL of the ISM manifest.
+                    * request_data  Data to send in POST request to the URL
                      * manifest_url
                                   The URL of the manifest file in case of
                                   fragmented media:
@@ -285,6 +290,7 @@ class InfoExtractor:
      channel_id:     Id of the channel.
      channel_url:    Full URL to a channel webpage.
      channel_follower_count: Number of followers of the channel.
+    channel_is_verified: Whether the channel is verified on the platform.
      location:       Physical location where the video was filmed.
      subtitles:      The available subtitles as a dictionary in the format
                      {tag: subformats}. "tag" is usually a language code, and
@@ -313,6 +319,11 @@ class InfoExtractor:
                          * "author" - human-readable name of the comment author
                          * "author_id" - user ID of the comment author
                          * "author_thumbnail" - The thumbnail of the comment author
+                        * "author_url" - The url to the comment author's page
+                        * "author_is_verified" - Whether the author is verified
+                                                 on the platform
+                        * "author_is_uploader" - Whether the comment is made by
+                                                 the video uploader
                          * "id" - Comment ID
                          * "html" - Comment as HTML
                          * "text" - Plain text of the comment
@@ -324,8 +335,8 @@ class InfoExtractor:
                          * "dislike_count" - Number of negative ratings of the comment
                          * "is_favorited" - Whether the comment is marked as
                                             favorite by the video uploader
-                        * "author_is_uploader" - Whether the comment is made by
-                                                 the video uploader
+                        * "is_pinned" - Whether the comment is pinned to
+                                        the top of the comments
      age_limit:      Age restriction for the video, as an integer (years)
      webpage_url:    The URL to the video webpage, if given to yt-dlp it
                      should allow to get the same result again. (It will be set
@@ -349,6 +360,10 @@ class InfoExtractor:
                          * "start_time" - The start time of the chapter in seconds
                          * "end_time" - The end time of the chapter in seconds
                          * "title" (optional, string)
+    heatmap:        A list of dictionaries, with the following entries:
+                        * "start_time" - The start time of the data point in seconds
+                        * "end_time" - The end time of the data point in seconds
+                        * "value" - The normalized value of the data point (float between 0 and 1)
      playable_in_embed: Whether this video is allowed to play in embedded
                      players on other sites. Can be True (=always allowed),
                      False (=never allowed), None (=unknown), or a string
@@ -524,7 +539,7 @@ class InfoExtractor:
      _EMBED_REGEX = []
  
      def _login_hint(self, method=NO_DEFAULT, netrc=None):
-        password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
+        password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
          return {
              None: '',
              'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
@@ -1280,45 +1295,48 @@ def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=Tr
          return clean_html(res)
  
      def _get_netrc_login_info(self, netrc_machine=None):
-        username = None
-        password = None
          netrc_machine = netrc_machine or self._NETRC_MACHINE
  
-        if self.get_param('usenetrc', False):
-            try:
-                netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
-                if os.path.isdir(netrc_file):
-                    netrc_file = os.path.join(netrc_file, '.netrc')
-                info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
-                if info is not None:
-                    username = info[0]
-                    password = info[2]
-                else:
-                    raise netrc.NetrcParseError(
-                        'No authenticators for %s' % netrc_machine)
-            except (OSError, netrc.NetrcParseError) as err:
-                self.report_warning(
-                    'parsing .netrc: %s' % error_to_compat_str(err))
+        cmd = self.get_param('netrc_cmd')
+        if cmd:
+            cmd = cmd.replace('{}', netrc_machine)
+            self.to_screen(f'Executing command: {cmd}')
+            stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
+            if ret != 0:
+                raise OSError(f'Command returned error code {ret}')
+            info = netrc_from_content(stdout).authenticators(netrc_machine)
+
+        elif self.get_param('usenetrc', False):
+            netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
+            if os.path.isdir(netrc_file):
+                netrc_file = os.path.join(netrc_file, '.netrc')
+            info = netrc.netrc(netrc_file).authenticators(netrc_machine)
  
-        return username, password
+        else:
+            return None, None
+        if not info:
+            raise netrc.NetrcParseError(f'No authenticators for {netrc_machine}')
+        return info[0], info[2]
  
      def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
          """
          Get the login info as (username, password)
          First look for the manually specified credentials using username_option
          and password_option as keys in params dictionary. If no such credentials
-        available look in the netrc file using the netrc_machine or _NETRC_MACHINE
-        value.
+        are available try the netrc_cmd if it is defined or look in the
+        netrc file using the netrc_machine or _NETRC_MACHINE value.
          If there's no info available, return (None, None)
          """
  
-        # Attempt to use provided username and password or .netrc data
          username = self.get_param(username_option)
          if username is not None:
              password = self.get_param(password_option)
          else:
-            username, password = self._get_netrc_login_info(netrc_machine)
-
+            try:
+                username, password = self._get_netrc_login_info(netrc_machine)
+            except (OSError, netrc.NetrcParseError) as err:
+                self.report_warning(f'Failed to parse .netrc: {err}')
+                return None, None
          return username, password
  
      def _get_tfa_info(self, note='two-factor verification code'):
@@ -2063,6 +2081,7 @@ def extract_media(x_media_line):
                      'protocol': entry_protocol,
                      'preference': preference,
                      'quality': quality,
+                    'has_drm': has_drm,
                      'vcodec': 'none' if media_type == 'AUDIO' else None,
                  } for idx in _extract_m3u8_playlist_indices(manifest_url))
  
@@ -2122,6 +2141,7 @@ def build_stream_name():
                          'protocol': entry_protocol,
                          'preference': preference,
                          'quality': quality,
+                        'has_drm': has_drm,
                      }
                      resolution = last_stream_inf.get('RESOLUTION')
                      if resolution:
@@ -2980,6 +3000,8 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
                          'protocol': 'ism',
                          'fragments': fragments,
                          'has_drm': ism_doc.find('Protection') is not None,
+                        'language': stream_language,
+                        'audio_channels': int_or_none(track.get('Channels')),
                          '_download_params': {
                              'stream_type': stream_type,
                              'duration': duration,
@@ -3435,7 +3457,7 @@ def _set_cookie(self, domain, name, value, expire_time=None, port=None,
  
      def _get_cookies(self, url):
          """ Return a http.cookies.SimpleCookie with the cookies for the url """
-        return LenientSimpleCookie(self._downloader._calc_cookies(url))
+        return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
  
      def _apply_first_set_cookie_header(self, url_handle, cookie):
          """
@@ -3510,8 +3532,8 @@ def _RETURN_TYPE(cls):
      @classmethod
      def is_single_video(cls, url):
          """Returns whether the URL is of a single video, None if unknown"""
-        assert cls.suitable(url), 'The URL must be suitable for the extractor'
-        return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
+        if cls.suitable(url):
+            return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
  
      @classmethod
      def is_suitable(cls, age_limit):
@@ -3524,7 +3546,7 @@ def description(cls, *, markdown=True, search_examples=None):
          desc = ''
          if cls._NETRC_MACHINE:
              if markdown:
-                desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
+                desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
              else:
                  desc += f' [{cls._NETRC_MACHINE}]'
          if cls.IE_DESC is False:
@@ -3646,6 +3668,42 @@ def _generic_title(self, url='', webpage='', *, default=None):
                  or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
                  or default)
  
+    def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
+        if not duration:
+            return
+        chapter_list = [{
+            'start_time': start_function(chapter),
+            'title': title_function(chapter),
+        } for chapter in chapter_list or []]
+        if strict:
+            warn = self.report_warning
+        else:
+            warn = self.write_debug
+            chapter_list.sort(key=lambda c: c['start_time'] or 0)
+
+        chapters = [{'start_time': 0}]
+        for idx, chapter in enumerate(chapter_list):
+            if chapter['start_time'] is None:
+                warn(f'Incomplete chapter {idx}')
+            elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
+                chapters.append(chapter)
+            elif chapter not in chapters:
+                issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
+                         else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
+                warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
+        return chapters[1:]
+
+    def _extract_chapters_from_description(self, description, duration):
+        duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
+        sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
+        return self._extract_chapters_helper(
+            re.findall(sep_re % (duration_re, r'.+?'), description or ''),
+            start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
+            duration=duration, strict=False) or self._extract_chapters_helper(
+            re.findall(sep_re % (r'.+?', duration_re), description or ''),
+            start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
+            duration=duration, strict=False)
+
      @staticmethod
      def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
          all_known = all(map(