[utils, cleanup] Consolidate known media extensions

[yt-dlp.git] / yt_dlp / utils.py
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py

index 5d4e607abaaf76e6dff543b5110fc0d392fc84d9..fcc25388d8f0449b7adca956fd3d75adea6c371c 100644 (file)
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -1,3 +1,4 @@
+import asyncio
  import atexit
  import base64
  import binascii
@@ -46,7 +47,7 @@
  import xml.etree.ElementTree
  import zlib
  
-from .compat import asyncio, functools  # isort: split
+from .compat import functools  # isort: split
  from .compat import (
      compat_etree_fromstring,
      compat_expanduser,
@@ -149,21 +150,6 @@ def random_user_agent():
          'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
  }
  
-KNOWN_EXTENSIONS = (
-    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
-    'flv', 'f4v', 'f4a', 'f4b',
-    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
-    'mkv', 'mka', 'mk3d',
-    'avi', 'divx',
-    'mov',
-    'asf', 'wmv', 'wma',
-    '3gp', '3g2',
-    'mp3',
-    'flac',
-    'ape',
-    'wav',
-    'f4f', 'f4m', 'm3u8', 'smil')
-
  # needed for sanitizing filenames in restricted mode
  ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                          itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
@@ -1071,6 +1057,14 @@ def __init__(self, msg, countries=None, **kwargs):
          self.countries = countries
  
  
+class UserNotLive(ExtractorError):
+    """Error raised when a channel/user is not live; always passes expected=True to ExtractorError"""
+
+    def __init__(self, msg=None, **kwargs):
+        kwargs['expected'] = True  # Forced unconditionally; overrides any value given by the caller
+        super().__init__(msg or 'The channel is not currently live', **kwargs)  # Falls back to a generic message
+
+
  class DownloadError(YoutubeDLError):
      """Download Error exception.
  
@@ -3485,6 +3479,7 @@ def age_restricted(content_limit, age_limit):
      return age_limit < content_limit
  
  
+# List of known byte-order-marks (BOM)
  BOMS = [
      (b'\xef\xbb\xbf', 'utf-8'),
      (b'\x00\x00\xfe\xff', 'utf-32-be'),
@@ -3492,7 +3487,6 @@ def age_restricted(content_limit, age_limit):
      (b'\xff\xfe', 'utf-16-le'),
      (b'\xfe\xff', 'utf-16-be'),
  ]
-""" List of known byte-order-marks (BOM) """
  
  
  def is_html(first_bytes):
@@ -3665,7 +3659,7 @@ def _match_func(info_dict, incomplete=False):
          if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
              return NO_DEFAULT if interactive and not incomplete else None
          else:
-            video_title = info_dict.get('title') or info_dict.get('id') or 'video'
+            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
              filter_str = ') | ('.join(map(str.strip, filters))
              return f'{video_title} does not pass filter ({filter_str}), skipping ..'
      return _match_func
@@ -5398,37 +5392,20 @@ def read_stdin(what):
  
  def determine_file_encoding(data):
      """
-    From the first 512 bytes of a given file,
-    it tries to detect the encoding to be used to read as text.
-
+    Detect the text encoding of the given bytes from a BOM or a leading "# coding: <name>" line
      @returns (encoding, bytes to skip)
      """
  
+    # BOMs are given priority over declarations; the BOM length is returned as the bytes to skip
      for bom, enc in BOMS:
-        # matching BOM beats any declaration
-        # BOMs are skipped to prevent any errors
          if data.startswith(bom):
              return enc, len(bom)
  
-    # strip off all null bytes to match even when UTF-16 or UTF-32 is used
-    # endians don't matter
+    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
+    # We ignore the endianness to get a good enough match
      data = data.replace(b'\0', b'')
-
-    PREAMBLES = [
-        # "# -*- coding: utf-8 -*-"
-        # "# coding: utf-8"
-        rb'(?m)^#(?:\s+-\*-)?\s*coding\s*:\s*(?P<encoding>\S+)(?:\s+-\*-)?\s*$',
-        # "# vi: set fileencoding=utf-8"
-        rb'^#\s+vi\s*:\s+set\s+fileencoding=(?P<encoding>[^\s,]+)'
-    ]
-    for pb in PREAMBLES:
-        mobj = re.match(pb, data)
-        if not mobj:
-            continue
-        # preambles aren't skipped since they're just ignored when reading as config
-        return mobj.group('encoding').decode(), 0
-
-    return None, 0
+    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)  # re.match only tries the very start of data
+    return mobj.group(1).decode() if mobj else None, 0  # i.e. (declared encoding or None, 0); nothing is skipped
  
  
  class Config:
@@ -5654,6 +5631,22 @@ def items_(self):
          return self.__dict__.items()
  
  
+MEDIA_EXTENSIONS = Namespace(
+    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
+    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
+    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
+    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
+    thumbnails=('jpg', 'png', 'webp'),
+    storyboards=('mhtml', ),
+    subtitles=('srt', 'vtt', 'ass', 'lrc'),
+    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
+)
+MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video  # "video" becomes a superset of "common_video"
+MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio  # "audio" becomes a superset of "common_audio"
+# All video, audio and manifest extensions; thumbnails, storyboards and subtitles are not included
+KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
+
+
  # Deprecated
  has_certifi = bool(certifi)
  has_websockets = bool(websockets)