[cleanup] Misc cleanup

[yt-dlp.git] / yt_dlp / extractor / common.py
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py

index 9cc3051c462d2a021e163ef4cbaa9181e385aa93..65444d3bf3672ca9a76d60a27a4e0128faef2382 100644 (file)
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -18,6 +18,7 @@
      compat_cookies_SimpleCookie,
      compat_etree_Element,
      compat_etree_fromstring,
+    compat_expanduser,
      compat_getpass,
      compat_http_client,
      compat_os_name,
@@ -203,6 +204,7 @@ class InfoExtractor(object):
                                   width : height ratio as float.
                      * no_resume  The server does not support resuming the
                                   (HTTP or RTMP) download. Boolean.
+                    * has_drm    The format has DRM and cannot be downloaded. Boolean.
                      * downloader_options  A dictionary of downloader options as
                                   described in FileDownloader
                      RTMP formats can also have the additional fields: page_url,
@@ -404,6 +406,10 @@ class InfoExtractor(object):
      _real_extract() methods and define a _VALID_URL regexp.
      Probably, they should also be added to the list of extractors.
  
+    Subclasses may also override suitable() if necessary, but ensure the function
+    signature is preserved and that this function imports everything it needs
+    (except other extractors), so that lazy_extractors works correctly.
+
      _GEO_BYPASS attribute may be set to False in order to disable
      geo restriction bypass mechanisms for a particular extractor.
      Though it won't disable explicit geo restriction bypass based on
@@ -419,7 +425,7 @@ class InfoExtractor(object):
      will be used by geo restriction bypass mechanism similarly
      to _GEO_COUNTRIES.
  
-    Finally, the _WORKING attribute should be set to False for broken IEs
+    The _WORKING attribute should be set to False for broken IEs
      in order to warn the users and skip the tests.
      """
  
@@ -447,23 +453,31 @@ def __init__(self, downloader=None):
          self.set_downloader(downloader)
  
      @classmethod
-    def suitable(cls, url):
-        """Receives a URL and returns True if suitable for this IE."""
-
+    def _match_valid_url(cls, url):
          # This does not use has/getattr intentionally - we want to know whether
          # we have cached the regexp for *this* class, whereas getattr would also
          # match the superclass
          if '_VALID_URL_RE' not in cls.__dict__:
              cls._VALID_URL_RE = re.compile(cls._VALID_URL)
-        return cls._VALID_URL_RE.match(url) is not None
+        return cls._VALID_URL_RE.match(url)
+
+    @classmethod
+    def suitable(cls, url):
+        """Receives a URL and returns True if suitable for this IE."""
+        # This function must import everything it needs (except other extractors),
+        # so that lazy_extractors works correctly
+        return cls._match_valid_url(url) is not None
  
      @classmethod
      def _match_id(cls, url):
-        if '_VALID_URL_RE' not in cls.__dict__:
-            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
-        m = cls._VALID_URL_RE.match(url)
-        assert m
-        return compat_str(m.group('id'))
+        return cls._match_valid_url(url).group('id')
+
+    @classmethod
+    def get_temp_id(cls, url):
+        try:
+            return cls._match_id(url)
+        except (IndexError, AttributeError):
+            return None
  
      @classmethod
      def working(cls):
@@ -586,12 +600,14 @@ def extract(self, url):
                      if self.__maybe_fake_ip_and_retry(e.countries):
                          continue
                      raise
-        except ExtractorError:
-            raise
+        except ExtractorError as e:
+            video_id = e.video_id or self.get_temp_id(url)
+            raise ExtractorError(
+                e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause)
          except compat_http_client.IncompleteRead as e:
-            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
+            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
          except (KeyError, StopIteration) as e:
-            raise ExtractorError('An extractor error has occurred.', cause=e)
+            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
  
      def __maybe_fake_ip_and_retry(self, countries):
          if (not self.get_param('geo_bypass_country', None)
@@ -623,7 +639,7 @@ def _real_extract(self, url):
      @classmethod
      def ie_key(cls):
          """A string for getting the InfoExtractor with get_info_extractor"""
-        return compat_str(cls.__name__[:-2])
+        return cls.__name__[:-2]
  
      @property
      def IE_NAME(self):
@@ -777,9 +793,10 @@ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errno
              self._downloader.to_screen(dump)
          if self.get_param('write_pages', False):
              basen = '%s_%s' % (video_id, urlh.geturl())
-            if len(basen) > 240:
+            trim_length = self.get_param('trim_file_name') or 240
+            if len(basen) > trim_length:
                  h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
-                basen = basen[:240 - len(h)] + h
+                basen = basen[:trim_length - len(h)] + h
              raw_filename = basen + '.dump'
              filename = sanitize_filename(raw_filename, restricted=True)
              self.to_screen('Saving request to ' + filename)
@@ -1023,6 +1040,9 @@ def get_param(self, name, default=None, *args, **kwargs):
              return self._downloader.params.get(name, default, *args, **kwargs)
          return default
  
+    def report_drm(self, video_id, partial=False):
+        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
+
      def report_extraction(self, id_or_name):
          """Report information extraction."""
          self.to_screen('%s: Extracting information' % id_or_name)
@@ -1114,10 +1134,7 @@ def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, f
                  if mobj:
                      break
  
-        if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
-            _name = '\033[0;34m%s\033[0m' % name
-        else:
-            _name = name
+        _name = self._downloader._color_text(name, 'blue')
  
          if mobj:
              if group is None:
@@ -1152,7 +1169,10 @@ def _get_netrc_login_info(self, netrc_machine=None):
  
          if self.get_param('usenetrc', False):
              try:
-                info = netrc.netrc().authenticators(netrc_machine)
+                netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
+                if os.path.isdir(netrc_file):
+                    netrc_file = os.path.join(netrc_file, '.netrc')
+                info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
                  if info is not None:
                      username = info[0]
                      password = info[2]
@@ -1488,7 +1508,7 @@ class FormatSort:
          default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                     'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
                     'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
-        ytdl_default = ('hasaud', 'quality', 'tbr', 'filesize', 'vbr',
+        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                          'height', 'width', 'proto', 'vext', 'abr', 'aext',
                          'fps', 'fs_approx', 'source', 'format_id')
  
@@ -1512,7 +1532,7 @@ class FormatSort:
              'ie_pref': {'priority': True, 'type': 'extractor'},
              'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
              'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
-            'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
+            'lang': {'convert': 'ignore', 'field': 'language_preference'},
              'quality': {'convert': 'float_none', 'default': -1},
              'filesize': {'convert': 'bytes'},
              'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
@@ -1658,7 +1678,7 @@ def add_item(field, reverse, closest, limit_text):
                  has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
  
                  fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
-                limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
+                limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
                  limit_count = len(limits)
                  for (i, f) in enumerate(fields):
                      add_item(f, reverse, closest,
@@ -1742,18 +1762,16 @@ def calculate_preference(self, format):
                  if format.get('vbr') is not None and format.get('abr') is not None:
                      format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
              else:
-                if format.get('vcodec') != "none" and format.get('vbr') is None:
+                if format.get('vcodec') != 'none' and format.get('vbr') is None:
                      format['vbr'] = format.get('tbr') - format.get('abr', 0)
-                if format.get('acodec') != "none" and format.get('abr') is None:
+                if format.get('acodec') != 'none' and format.get('abr') is None:
                      format['abr'] = format.get('tbr') - format.get('vbr', 0)
  
              return tuple(self._calculate_field_preference(format, field) for field in self._order)
  
      def _sort_formats(self, formats, field_preference=[]):
          if not formats:
-            if self.get_param('ignore_no_formats_error'):
-                return
-            raise ExtractorError('No video formats found')
+            return
          format_sort = self.FormatSort()  # params and to_screen are taken from the downloader
          format_sort.evaluate_params(self._downloader.params, field_preference)
          if self.get_param('verbose', False):
@@ -1948,13 +1966,16 @@ def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m
              'format_note': 'Quality selection URL',
          }
  
+    def _report_ignoring_subs(self, name):
+        self.report_warning(bug_reports_message(
+            f'Ignoring subtitle tracks found in the {name} manifest; '
+            'if any subtitle tracks are missing,'
+        ), only_once=True)
+
      def _extract_m3u8_formats(self, *args, **kwargs):
          fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
          if subs:
-            self.report_warning(bug_reports_message(
-                "Ignoring subtitle tracks found in the HLS manifest; "
-                "if any subtitle tracks are missing,"
-            ), only_once=True)
+            self._report_ignoring_subs('HLS')
          return fmts
  
      def _extract_m3u8_formats_and_subtitles(
@@ -1991,9 +2012,7 @@ def _parse_m3u8_formats_and_subtitles(
          if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
              return formats, subtitles
  
-        if (not self.get_param('allow_unplayable_formats')
-                and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)):  # Apple FairPlay
-            return formats, subtitles
+        has_drm = re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)
  
          def format_url(url):
              return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
@@ -2039,6 +2058,7 @@ def _extract_m3u8_playlist_indices(*args, **kwargs):
                  'protocol': entry_protocol,
                  'preference': preference,
                  'quality': quality,
+                'has_drm': has_drm,
              } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
  
              return formats, subtitles
@@ -2203,6 +2223,25 @@ def build_stream_name():
                  last_stream_inf = {}
          return formats, subtitles
  
+    def _extract_m3u8_vod_duration(
+            self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
+
+        m3u8_vod = self._download_webpage(
+            m3u8_vod_url, video_id,
+            note='Downloading m3u8 VOD manifest' if note is None else note,
+            errnote='Failed to download VOD manifest' if errnote is None else errnote,
+            fatal=False, data=data, headers=headers, query=query)
+
+        return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
+
+    def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
+        if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
+            return None
+
+        return int(sum(
+            float(line[len('#EXTINF:'):].split(',')[0])
+            for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
+
      @staticmethod
      def _xpath_ns(path, namespace=None):
          if not namespace:
@@ -2234,10 +2273,7 @@ def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4
      def _extract_smil_formats(self, *args, **kwargs):
          fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
          if subs:
-            self.report_warning(bug_reports_message(
-                "Ignoring subtitle tracks found in the SMIL manifest; "
-                "if any subtitle tracks are missing,"
-            ), only_once=True)
+            self._report_ignoring_subs('SMIL')
          return fmts
  
      def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
@@ -2307,14 +2343,15 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para
          rtmp_count = 0
          http_count = 0
          m3u8_count = 0
+        imgs_count = 0
  
-        srcs = []
+        srcs = set()
          media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
          for medium in media:
              src = medium.get('src')
              if not src or src in srcs:
                  continue
-            srcs.append(src)
+            srcs.add(src)
  
              bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
              filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
@@ -2388,6 +2425,24 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para
                      'height': height,
                  })
  
+        for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
+            src = medium.get('src')
+            if not src or src in srcs:
+                continue
+            srcs.add(src)
+
+            imgs_count += 1
+            formats.append({
+                'format_id': 'imagestream-%d' % (imgs_count),
+                'url': src,
+                'ext': mimetype2ext(medium.get('type')),
+                'acodec': 'none',
+                'vcodec': 'none',
+                'width': int_or_none(medium.get('width')),
+                'height': int_or_none(medium.get('height')),
+                'format_note': 'SMIL storyboards',
+            })
+
          return formats
  
      def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
@@ -2460,10 +2515,7 @@ def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
      def _extract_mpd_formats(self, *args, **kwargs):
          fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
          if subs:
-            self.report_warning(bug_reports_message(
-                "Ignoring subtitle tracks found in the DASH manifest; "
-                "if any subtitle tracks are missing,"
-            ), only_once=True)
+            self._report_ignoring_subs('DASH')
          return fmts
  
      def _extract_mpd_formats_and_subtitles(
@@ -2487,10 +2539,7 @@ def _extract_mpd_formats_and_subtitles(
      def _parse_mpd_formats(self, *args, **kwargs):
          fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
          if subs:
-            self.report_warning(bug_reports_message(
-                "Ignoring subtitle tracks found in the DASH manifest; "
-                "if any subtitle tracks are missing,"
-            ), only_once=True)
+            self._report_ignoring_subs('DASH')
          return fmts
  
      def _parse_mpd_formats_and_subtitles(
@@ -2572,11 +2621,9 @@ def extract_Initialization(source):
                          extract_Initialization(segment_template)
              return ms_info
  
-        skip_unplayable = not self.get_param('allow_unplayable_formats')
-
          mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
-        formats = []
-        subtitles = {}
+        formats, subtitles = [], {}
+        stream_numbers = {'audio': 0, 'video': 0}
          for period in mpd_doc.findall(_add_ns('Period')):
              period_duration = parse_duration(period.get('duration')) or mpd_duration
              period_ms_info = extract_multisegment_info(period, {
@@ -2584,12 +2631,8 @@ def extract_Initialization(source):
                  'timescale': 1,
              })
              for adaptation_set in period.findall(_add_ns('AdaptationSet')):
-                if skip_unplayable and is_drm_protected(adaptation_set):
-                    continue
                  adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                  for representation in adaptation_set.findall(_add_ns('Representation')):
-                    if skip_unplayable and is_drm_protected(representation):
-                        continue
                      representation_attrib = adaptation_set.attrib.copy()
                      representation_attrib.update(representation.attrib)
                      # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
@@ -2599,8 +2642,8 @@ def extract_Initialization(source):
                      codecs = representation_attrib.get('codecs', '')
                      if content_type not in ('video', 'audio', 'text'):
                          if mime_type == 'image/jpeg':
-                            content_type = 'image/jpeg'
-                        if codecs.split('.')[0] == 'stpp':
+                            content_type = mime_type
+                        elif codecs.split('.')[0] == 'stpp':
                              content_type = 'text'
                          else:
                              self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
@@ -2613,8 +2656,10 @@ def extract_Initialization(source):
                              base_url = base_url_e.text + base_url
                              if re.match(r'^https?://', base_url):
                                  break
-                    if mpd_base_url and not re.match(r'^https?://', base_url):
-                        if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
+                    if mpd_base_url and base_url.startswith('/'):
+                        base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
+                    elif mpd_base_url and not re.match(r'^https?://', base_url):
+                        if not mpd_base_url.endswith('/'):
                              mpd_base_url += '/'
                          base_url = mpd_base_url + base_url
                      representation_id = representation_attrib.get('id')
@@ -2642,8 +2687,10 @@ def extract_Initialization(source):
                              'format_note': 'DASH %s' % content_type,
                              'filesize': filesize,
                              'container': mimetype2ext(mime_type) + '_dash',
+                            'manifest_stream_number': stream_numbers[content_type]
                          }
                          f.update(parse_codecs(codecs))
+                        stream_numbers[content_type] += 1
                      elif content_type == 'text':
                          f = {
                              'ext': mimetype2ext(mime_type),
@@ -2661,6 +2708,8 @@ def extract_Initialization(source):
                              'acodec': 'none',
                              'vcodec': 'none',
                          }
+                    if is_drm_protected(adaptation_set) or is_drm_protected(representation):
+                        f['has_drm'] = True
                      representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
  
                      def prepare_template(template_name, identifiers):
@@ -2818,10 +2867,7 @@ def add_segment_url():
      def _extract_ism_formats(self, *args, **kwargs):
          fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
          if subs:
-            self.report_warning(bug_reports_message(
-                "Ignoring subtitle tracks found in the ISM manifest; "
-                "if any subtitle tracks are missing,"
-            ))
+            self._report_ignoring_subs('ISM')
          return fmts
  
      def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
@@ -2847,9 +2893,6 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
          """
          if ism_doc.get('IsLive') == 'TRUE':
              return [], {}
-        if (not self.get_param('allow_unplayable_formats')
-                and ism_doc.find('Protection') is not None):
-            return [], {}
  
          duration = int(ism_doc.attrib['Duration'])
          timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
@@ -2940,6 +2983,7 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
                          'acodec': 'none' if stream_type == 'video' else fourcc,
                          'protocol': 'ism',
                          'fragments': fragments,
+                        'has_drm': ism_doc.find('Protection') is not None,
                          '_download_params': {
                              'stream_type': stream_type,
                              'duration': duration,
@@ -3083,10 +3127,7 @@ def _media_formats(src, cur_media_type, type_info={}):
      def _extract_akamai_formats(self, *args, **kwargs):
          fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
          if subs:
-            self.report_warning(bug_reports_message(
-                "Ignoring subtitle tracks found in the manifests; "
-                "if any subtitle tracks are missing,"
-            ))
+            self._report_ignoring_subs('akamai')
          return fmts
  
      def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
@@ -3489,9 +3530,11 @@ def _get_automatic_captions(self, *args, **kwargs):
          raise NotImplementedError('This method must be implemented by subclasses')
  
      def mark_watched(self, *args, **kwargs):
-        if (self.get_param('mark_watched', False)
-                and (self._get_login_info()[0] is not None
-                     or self.get_param('cookiefile') is not None)):
+        if not self.get_param('mark_watched', False):
+            return
+        if (self._get_login_info()[0] is not None
+                or self.get_param('cookiefile')
+                or self.get_param('cookiesfrombrowser')):
              self._mark_watched(*args, **kwargs)
  
      def _mark_watched(self, *args, **kwargs):