]> jfr.im git - yt-dlp.git/blobdiff - yt_dlp/YoutubeDL.py
[cleanup] Remove extractors for some dead websites (#2739)
[yt-dlp.git] / yt_dlp / YoutubeDL.py
index 71369bc44c3a3558ef24c1f8b9de59328c1a7a15..fd1584a7f0b566b17a434384beceb091a5ad3695 100644 (file)
@@ -72,6 +72,7 @@
     GeoRestrictedError,
     get_domain,
     HEADRequest,
+    InAdvancePagedList,
     int_or_none,
     iri_to_uri,
     ISO3166Utils,
@@ -200,9 +201,12 @@ class YoutubeDL(object):
     verbose:           Print additional info to stdout.
     quiet:             Do not print messages to stdout.
     no_warnings:       Do not print out anything for warnings.
-    forceprint:        A dict with keys video/playlist mapped to
-                       a list of templates to force print to stdout
+    forceprint:        A dict with keys WHEN mapped to a list of templates to
+                       print to stdout. The allowed keys are video or any of the
+                       items in utils.POSTPROCESS_WHEN.
                        For compatibility, a single list is also accepted
+    print_to_file:     A dict with keys WHEN (same as forceprint) mapped to
+                       a list of tuples with (template, filename)
     forceurl:          Force printing final URL. (Deprecated)
     forcetitle:        Force printing title. (Deprecated)
     forceid:           Force printing ID. (Deprecated)
@@ -323,6 +327,8 @@ class YoutubeDL(object):
     cookiesfrombrowser:  A tuple containing the name of the browser, the profile
                        name/pathfrom where cookies are loaded, and the name of the
                        keyring. Eg: ('chrome', ) or ('vivaldi', 'default', 'BASICTEXT')
+    legacyserverconnect: Explicitly allow HTTPS connection to servers that do not
+                       support RFC 5746 secure renegotiation
     nocheckcertificate:  Do not verify SSL certificates
     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                        At the moment, this is only supported by YouTube.
@@ -346,8 +352,8 @@ class YoutubeDL(object):
     postprocessors:    A list of dictionaries, each with an entry
                        * key:  The name of the postprocessor. See
                                yt_dlp/postprocessor/__init__.py for a list.
-                       * when: When to run the postprocessor. Can be one of
-                               pre_process|before_dl|post_process|after_move.
+                       * when: When to run the postprocessor. Allowed values are
+                               the entries of utils.POSTPROCESS_WHEN
                                Assumed to be 'post_process' if not given
     post_hooks:        Deprecated - Register a custom postprocessor instead
                        A list of functions that get called as the final step
@@ -478,6 +484,7 @@ class YoutubeDL(object):
     extractor_args:    A dictionary of arguments to be passed to the extractors.
                        See "EXTRACTOR ARGUMENTS" for details.
                        Eg: {'youtube': {'skip': ['dash', 'hls']}}
+    mark_watched:      Mark videos watched (even with --simulate). Only for YouTube
     youtube_include_dash_manifest: Deprecated - Use extractor_args instead.
                        If True (default), DASH manifests and related
                        data will be downloaded and processed by extractor.
@@ -589,12 +596,14 @@ def check_deprecated(param, option, suggestion):
         else:
             self.params['nooverwrites'] = not self.params['overwrites']
 
+        self.params.setdefault('forceprint', {})
+        self.params.setdefault('print_to_file', {})
+
         # Compatibility with older syntax
-        params.setdefault('forceprint', {})
         if not isinstance(params['forceprint'], dict):
-            params['forceprint'] = {'video': params['forceprint']}
+            self.params['forceprint'] = {'video': params['forceprint']}
 
-        if params.get('bidi_workaround', False):
+        if self.params.get('bidi_workaround', False):
             try:
                 import pty
                 master, slave = pty.openpty()
@@ -622,7 +631,7 @@ def check_deprecated(param, option, suggestion):
 
         if (sys.platform != 'win32'
                 and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
-                and not params.get('restrictfilenames', False)):
+                and not self.params.get('restrictfilenames', False)):
             # Unicode filesystem API will throw errors (#1474, #13027)
             self.report_warning(
                 'Assuming --restrict-filenames since file system encoding '
@@ -1213,10 +1222,17 @@ def _prepare_filename(self, info_dict, tmpl_type='default'):
         try:
             outtmpl = self._outtmpl_expandpath(self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default']))
             filename = self.evaluate_outtmpl(outtmpl, info_dict, True)
+            if not filename:
+                return None
 
-            force_ext = OUTTMPL_TYPES.get(tmpl_type)
-            if filename and force_ext is not None:
-                filename = replace_extension(filename, force_ext, info_dict.get('ext'))
+            if tmpl_type in ('default', 'temp'):
+                final_ext, ext = self.params.get('final_ext'), info_dict.get('ext')
+                if final_ext and ext and final_ext != ext and filename.endswith(f'.{final_ext}'):
+                    filename = replace_extension(filename, ext, final_ext)
+            else:
+                force_ext = OUTTMPL_TYPES[tmpl_type]
+                if force_ext:
+                    filename = replace_extension(filename, force_ext, info_dict.get('ext'))
 
             # https://github.com/blackjack4494/youtube-dlc/issues/85
             trim_file_name = self.params.get('trim_file_name', False)
@@ -1596,6 +1612,19 @@ def _fixup(r):
     def _ensure_dir_exists(self, path):
         return make_dir(path, self.report_error)
 
+    @staticmethod
+    def _playlist_infodict(ie_result, **kwargs):
+        return {
+            **ie_result,
+            'playlist': ie_result.get('title') or ie_result.get('id'),
+            'playlist_id': ie_result.get('id'),
+            'playlist_title': ie_result.get('title'),
+            'playlist_uploader': ie_result.get('uploader'),
+            'playlist_uploader_id': ie_result.get('uploader_id'),
+            'playlist_index': 0,
+            **kwargs,
+        }
+
     def __process_playlist(self, ie_result, download):
         # We process each entry in the playlist
         playlist = ie_result.get('title') or ie_result.get('id')
@@ -1647,6 +1676,9 @@ def get_entry(i):
             msg = 'Downloading %d videos'
             if not isinstance(ie_entries, (PagedList, LazyList)):
                 ie_entries = LazyList(ie_entries)
+            elif isinstance(ie_entries, InAdvancePagedList):
+                if ie_entries._pagesize == 1:
+                    playlist_count = ie_entries._pagecount
 
             def get_entry(i):
                 return YoutubeDL.__handle_extraction_exceptions(
@@ -1694,18 +1726,11 @@ def get_entry(i):
         ie_result['requested_entries'] = playlistitems
 
         _infojson_written = False
-        if not self.params.get('simulate') and self.params.get('allow_playlist_files', True):
-            ie_copy = {
-                'playlist': playlist,
-                'playlist_id': ie_result.get('id'),
-                'playlist_title': ie_result.get('title'),
-                'playlist_uploader': ie_result.get('uploader'),
-                'playlist_uploader_id': ie_result.get('uploader_id'),
-                'playlist_index': 0,
-                'n_entries': n_entries,
-            }
-            ie_copy.update(dict(ie_result))
-
+        write_playlist_files = self.params.get('allow_playlist_files', True)
+        if write_playlist_files and self.params.get('list_thumbnails'):
+            self.list_thumbnails(ie_result)
+        if write_playlist_files and not self.params.get('simulate'):
+            ie_copy = self._playlist_infodict(ie_result, n_entries=n_entries)
             _infojson_written = self._write_info_json(
                 'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson'))
             if _infojson_written is None:
@@ -2215,10 +2240,7 @@ def restore_last_token(self):
 
     def _calc_headers(self, info_dict):
         res = std_headers.copy()
-
-        add_headers = info_dict.get('http_headers')
-        if add_headers:
-            res.update(add_headers)
+        res.update(info_dict.get('http_headers') or {})
 
         cookies = self._calc_cookies(info_dict)
         if cookies:
@@ -2281,10 +2303,17 @@ def process_video_result(self, info_dict, download=True):
         self._num_videos += 1
 
         if 'id' not in info_dict:
-            raise ExtractorError('Missing "id" field in extractor result')
+            raise ExtractorError('Missing "id" field in extractor result', ie=info_dict['extractor'])
+        elif not info_dict.get('id'):
+            raise ExtractorError('Extractor failed to obtain "id"', ie=info_dict['extractor'])
+
+        info_dict['fulltitle'] = info_dict.get('title')
         if 'title' not in info_dict:
             raise ExtractorError('Missing "title" field in extractor result',
                                  video_id=info_dict['id'], ie=info_dict['extractor'])
+        elif not info_dict.get('title'):
+            self.report_warning('Extractor failed to obtain "title". Creating a generic title instead')
+            info_dict['title'] = f'{info_dict["extractor"]} video #{info_dict["id"]}'
 
         def report_force_conversion(field, field_not, conversion):
             self.report_warning(
@@ -2392,9 +2421,6 @@ def sanitize_numeric_fields(info):
         if not self.params.get('allow_unplayable_formats'):
             formats = [f for f in formats if not f.get('has_drm')]
 
-        # backward compatibility
-        info_dict['fulltitle'] = info_dict['title']
-
         if info_dict.get('is_live'):
             get_from_start = bool(self.params.get('live_from_start'))
             formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start]
@@ -2671,19 +2697,32 @@ def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
             subs[lang] = f
         return subs
 
-    def _forceprint(self, tmpl, info_dict):
-        mobj = re.match(r'\w+(=?)$', tmpl)
-        if mobj and mobj.group(1):
-            tmpl = f'{tmpl[:-1]} = %({tmpl[:-1]})s'
-        elif mobj:
-            tmpl = '%({})s'.format(tmpl)
+    def _forceprint(self, key, info_dict):
+        if info_dict is None:
+            return
+        info_copy = info_dict.copy()
+        info_copy['formats_table'] = self.render_formats_table(info_dict)
+        info_copy['thumbnails_table'] = self.render_thumbnails_table(info_dict)
+        info_copy['subtitles_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('subtitles'))
+        info_copy['automatic_captions_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('automatic_captions'))
+
+        def format_tmpl(tmpl):
+            mobj = re.match(r'\w+(=?)$', tmpl)
+            if mobj and mobj.group(1):
+                return f'{tmpl[:-1]} = %({tmpl[:-1]})r'
+            elif mobj:
+                return f'%({tmpl})s'
+            return tmpl
 
-        info_dict = info_dict.copy()
-        info_dict['formats_table'] = self.render_formats_table(info_dict)
-        info_dict['thumbnails_table'] = self.render_thumbnails_table(info_dict)
-        info_dict['subtitles_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('subtitles'))
-        info_dict['automatic_captions_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('automatic_captions'))
-        self.to_stdout(self.evaluate_outtmpl(tmpl, info_dict))
+        for tmpl in self.params['forceprint'].get(key, []):
+            self.to_stdout(self.evaluate_outtmpl(format_tmpl(tmpl), info_copy))
+
+        for tmpl, file_tmpl in self.params['print_to_file'].get(key, []):
+            filename = self.evaluate_outtmpl(file_tmpl, info_dict)
+            tmpl = format_tmpl(tmpl)
+            self.to_screen(f'[info] Writing {tmpl!r} to: {filename}')
+            with io.open(filename, 'a', encoding='utf-8') as f:
+                f.write(self.evaluate_outtmpl(tmpl, info_copy) + '\n')
 
     def __forced_printings(self, info_dict, filename, incomplete):
         def print_mandatory(field, actual_field=None):
@@ -2707,10 +2746,11 @@ def print_optional(field):
         elif 'url' in info_dict:
             info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')
 
-        if self.params['forceprint'].get('video') or self.params.get('forcejson'):
+        if (self.params.get('forcejson')
+                or self.params['forceprint'].get('video')
+                or self.params['print_to_file'].get('video')):
             self.post_extract(info_dict)
-        for tmpl in self.params['forceprint'].get('video', []):
-            self._forceprint(tmpl, info_dict)
+        self._forceprint('video', info_dict)
 
         print_mandatory('title')
         print_mandatory('id')
@@ -2748,7 +2788,9 @@ def dl(self, name, info, subtitle=False, test=False):
         if not test:
             for ph in self._progress_hooks:
                 fd.add_progress_hook(ph)
-            urls = '", "'.join([f['url'] for f in info.get('requested_formats', [])] or [info['url']])
+            urls = '", "'.join(
+                (f['url'].split(',')[0] + ',<data>' if f['url'].startswith('data:') else f['url'])
+                for f in info.get('requested_formats', []) or [info])
             self.write_debug('Invoking downloader on "%s"' % urls)
 
         # Note: Ideally info should be a deep-copied so that hooks cannot modify it.
@@ -3197,6 +3239,7 @@ def sanitize_info(info_dict, remove_private_keys=False):
         if info_dict is None:
             return info_dict
         info_dict.setdefault('epoch', int(time.time()))
+        info_dict.setdefault('_type', 'video')
         remove_keys = {'__original_infodict'}  # Always remove this since this may contain a copy of the entire dict
         keep_keys = ['_type']  # Always keep this to facilitate load-info-json
         if remove_private_keys:
@@ -3275,8 +3318,7 @@ def run_pp(self, pp, infodict):
         return infodict
 
     def run_all_pps(self, key, info, *, additional_pps=None):
-        for tmpl in self.params['forceprint'].get(key, []):
-            self._forceprint(tmpl, info)
+        self._forceprint(key, info)
         for pp in (additional_pps or []) + self._pps[key]:
             info = self.run_pp(pp, info)
         return info
@@ -3471,12 +3513,12 @@ def render_formats_table(self, info_dict):
             delim=self._format_screen('\u2500', self.Styles.DELIM, '-', test_encoding=True))
 
     def render_thumbnails_table(self, info_dict):
-        thumbnails = list(info_dict.get('thumbnails'))
+        thumbnails = list(info_dict.get('thumbnails') or [])
         if not thumbnails:
             return None
         return render_table(
             self._list_format_headers('ID', 'Width', 'Height', 'URL'),
-            [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])
+            [[t.get('id'), t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])
 
     def render_subtitles_table(self, video_id, subtitles):
         def _row(lang, formats):