[extractor] Framework for embed detection (#4307)

[yt-dlp.git] / yt_dlp / utils.py
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py

index 6e0c31c012ec0b5c4b69bd387bf2844a0dc472c6..545c027635da2213809c01b746155ca0e3e436d4 100644 (file)
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -1,3 +1,4 @@
+import asyncio
  import atexit
  import base64
  import binascii
@@ -46,7 +47,7 @@
  import xml.etree.ElementTree
  import zlib
  
-from .compat import asyncio, functools  # isort: split
+from .compat import functools  # isort: split
  from .compat import (
      compat_etree_fromstring,
      compat_expanduser,
@@ -149,21 +150,6 @@ def random_user_agent():
          'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
  }
  
-KNOWN_EXTENSIONS = (
-    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
-    'flv', 'f4v', 'f4a', 'f4b',
-    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
-    'mkv', 'mka', 'mk3d',
-    'avi', 'divx',
-    'mov',
-    'asf', 'wmv', 'wma',
-    '3gp', '3g2',
-    'mp3',
-    'flac',
-    'ape',
-    'wav',
-    'f4f', 'f4m', 'm3u8', 'smil')
-
  # needed for sanitizing filenames in restricted mode
  ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                          itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
@@ -222,6 +208,7 @@ def random_user_agent():
      '%d/%m/%Y',
      '%d/%m/%y',
      '%d/%m/%Y %H:%M:%S',
+    '%d-%m-%Y %H:%M',
  ])
  
  DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
@@ -612,7 +599,9 @@ def sanitize_open(filename, open_mode):
      if filename == '-':
          if sys.platform == 'win32':
              import msvcrt
-            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
+            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
+            with contextlib.suppress(io.UnsupportedOperation):
+                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
          return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
  
      for attempt in range(2):
@@ -716,13 +705,13 @@ def sanitize_path(s, force=False):
      return os.path.join(*sanitized_path)
  
  
-def sanitize_url(url):
+def sanitize_url(url, *, scheme='http'):
      # Prepend protocol-less URLs with `http:` scheme in order to mitigate
      # the number of unwanted failures due to missing protocol
      if url is None:
          return
      elif url.startswith('//'):
-        return 'http:%s' % url
+        return f'{scheme}:{url}'
      # Fix some common typos seen so far
      COMMON_TYPOS = (
          # https://github.com/ytdl-org/youtube-dl/issues/15649
@@ -1071,6 +1060,14 @@ def __init__(self, msg, countries=None, **kwargs):
          self.countries = countries
  
  
+class UserNotLive(ExtractorError):
+    """Error when a channel/user is not live; always raised with expected=True"""
+
+    def __init__(self, msg=None, **kwargs):
+        kwargs['expected'] = True  # overrides any `expected` value supplied by the caller
+        super().__init__(msg or 'The channel is not currently live', **kwargs)  # falsy msg falls back to the generic text
+
+
  class DownloadError(YoutubeDLError):
      """Download Error exception.
  
@@ -3485,17 +3482,19 @@ def age_restricted(content_limit, age_limit):
      return age_limit < content_limit
  
  
+# Known byte-order marks (BOM) as (bytes, encoding). UTF-32 comes before UTF-16: b'\xff\xfe' is a prefix of the UTF-32-LE BOM
+BOMS = [
+    (b'\xef\xbb\xbf', 'utf-8'),
+    (b'\x00\x00\xfe\xff', 'utf-32-be'),
+    (b'\xff\xfe\x00\x00', 'utf-32-le'),
+    (b'\xff\xfe', 'utf-16-le'),
+    (b'\xfe\xff', 'utf-16-be'),
+]
+
+
  def is_html(first_bytes):
      """ Detect whether a file contains HTML by examining its first bytes. """
  
-    BOMS = [
-        (b'\xef\xbb\xbf', 'utf-8'),
-        (b'\x00\x00\xfe\xff', 'utf-32-be'),
-        (b'\xff\xfe\x00\x00', 'utf-32-le'),
-        (b'\xff\xfe', 'utf-16-le'),
-        (b'\xfe\xff', 'utf-16-be'),
-    ]
-
      encoding = 'utf-8'
      for bom, enc in BOMS:
          while first_bytes.startswith(bom):
@@ -3663,7 +3662,7 @@ def _match_func(info_dict, incomplete=False):
          if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
              return NO_DEFAULT if interactive and not incomplete else None
          else:
-            video_title = info_dict.get('title') or info_dict.get('id') or 'video'
+            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
              filter_str = ') | ('.join(map(str.strip, filters))
              return f'{video_title} does not pass filter ({filter_str}), skipping ..'
      return _match_func
@@ -5394,6 +5393,24 @@ def read_stdin(what):
      return sys.stdin
  
  
+def determine_file_encoding(data):
+    """
+    Detect the text encoding of the bytes `data` from a leading BOM or a `# coding: <name>` declaration
+    @returns (encoding, or None if undetected; number of BOM bytes to skip, 0 when there is no BOM)
+    """
+
+    # BOMs are given priority over declarations
+    for bom, enc in BOMS:
+        if data.startswith(bom):
+            return enc, len(bom)
+
+    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
+    # We ignore the endianness to get a good enough match
+    data = data.replace(b'\0', b'')
+    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)  # re.match anchors at offset 0: the declaration must open the data
+    return mobj.group(1).decode() if mobj else None, 0
+
+
  class Config:
      own_args = None
      parsed_args = None
@@ -5445,12 +5462,17 @@ def __str__(self):
      @staticmethod
      def read_file(filename, default=[]):
          try:
-            optionf = open(filename)
+            optionf = open(filename, 'rb')
          except OSError:
              return default  # silently skip if file is not present
+        try:
+            enc, skip = determine_file_encoding(optionf.read(512))
+            optionf.seek(skip, io.SEEK_SET)
+        except OSError:
+            enc = None  # silently skip read errors
          try:
              # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
-            contents = optionf.read()
+            contents = optionf.read().decode(enc or preferredencoding())
              res = shlex.split(contents, comments=True)
          except Exception as err:
              raise ValueError(f'Unable to parse "{filename}": {err}')
@@ -5612,6 +5634,22 @@ def items_(self):
          return self.__dict__.items()
  
  
+MEDIA_EXTENSIONS = Namespace(  # file extensions grouped by media kind
+    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
+    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
+    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
+    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
+    thumbnails=('jpg', 'png', 'webp'),
+    storyboards=('mhtml', ),
+    subtitles=('srt', 'vtt', 'ass', 'lrc'),
+    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
+)
+MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video  # `video` now also contains the common video extensions
+MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio  # likewise, `audio` now also contains the common audio extensions
+
+KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)  # thumbnails, storyboards and subtitles are left out
+
+
  # Deprecated
  has_certifi = bool(certifi)
  has_websockets = bool(websockets)