]> jfr.im git - yt-dlp.git/commitdiff
[extractor] Improve `_generic_title`
author pukkandan <redacted>
Mon, 31 Oct 2022 12:05:20 +0000 (17:35 +0530)
committer pukkandan <redacted>
Mon, 31 Oct 2022 12:11:48 +0000 (17:41 +0530)
16 files changed:
yt_dlp/extractor/arte.py
yt_dlp/extractor/bbc.py
yt_dlp/extractor/breitbart.py
yt_dlp/extractor/callin.py
yt_dlp/extractor/common.py
yt_dlp/extractor/cspan.py
yt_dlp/extractor/fivetv.py
yt_dlp/extractor/generic.py
yt_dlp/extractor/genericembeds.py
yt_dlp/extractor/glide.py
yt_dlp/extractor/meipai.py
yt_dlp/extractor/nhk.py
yt_dlp/extractor/onenewsnz.py
yt_dlp/extractor/steam.py
yt_dlp/extractor/tennistv.py
yt_dlp/extractor/tv24ua.py

index d3ec4a66c8c9d76feff92e0da5445491bee6c9f9..b60fa0233e0ae319ba62b842ee769442e7c921b1 100644 (file)
@@ -303,9 +303,7 @@ def _real_extract(self, url):
             if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
                 items.append(video)
 
-        title = (self._og_search_title(webpage, default=None)
-                 or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title>', default=None))
-        title = strip_or_none(title.rsplit('|', 1)[0]) or self._generic_title(url)
+        title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None
 
         return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title,
                                           description=self._og_search_description(webpage, default=None))
index 9a0a4414e7e6465b5d4f94b5d0b64f33e85cac81..89fce8d5a8715724000a0b2aef21b8a2c63a5f5f 100644 (file)
@@ -898,12 +898,8 @@ def _real_extract(self, url):
         json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
         timestamp = json_ld_info.get('timestamp')
 
-        playlist_title = json_ld_info.get('title')
-        if not playlist_title:
-            playlist_title = (self._og_search_title(webpage, default=None)
-                              or self._html_extract_title(webpage, 'playlist title', default=None))
-            if playlist_title:
-                playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
+        playlist_title = json_ld_info.get('title') or re.sub(
+            r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None
 
         playlist_description = json_ld_info.get(
             'description') or self._og_search_description(webpage, default=None)
index a2b04fccee84f6514fe57dae590149d363ee9ba8..ca5757374d6a3f308af8f09e2c84c837aaf81480 100644 (file)
@@ -27,8 +27,7 @@ def _real_extract(self, url):
         self._sort_formats(formats)
         return {
             'id': video_id,
-            'title': (self._og_search_title(webpage, default=None)
-                      or self._html_extract_title(webpage, 'video title')),
+            'title': self._generic_title('', webpage),
             'description': self._og_search_description(webpage),
             'thumbnail': self._og_search_thumbnail(webpage),
             'age_limit': self._rta_search(webpage),
index fc5da7028333a687e5b209db59cfefea0bf80e7b..6c8129f0658149d627520ddd2ffb093815de456f 100644 (file)
@@ -51,9 +51,7 @@ def _real_extract(self, url):
         episode = next_data['props']['pageProps']['episode']
 
         id = episode['id']
-        title = (episode.get('title')
-                 or self._og_search_title(webpage, fatal=False)
-                 or self._html_extract_title(webpage))
+        title = episode.get('title') or self._generic_title('', webpage)
         url = episode['m3u8']
         formats = self._extract_m3u8_formats(url, display_id, ext='ts')
         self._sort_formats(formats)
index fb787a722130078b2b7ffdb624a9226fe4756f8b..84a2b95af5fd4ce124615873957e4635baabc901 100644 (file)
@@ -3820,9 +3820,11 @@ def geo_verification_headers(self):
     def _generic_id(url):
         return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
 
-    @staticmethod
-    def _generic_title(url):
-        return urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
+    def _generic_title(self, url='', webpage='', *, default=None):
+        return (self._og_search_title(webpage, default=None)
+                or self._html_extract_title(webpage, default=None)
+                or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
+                or default)
 
     @staticmethod
     def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
index 84393627a530a46b25e998e16253bebe63186fb6..1184633f530a6b8fdc4b410cf3344927afa2eb74 100644 (file)
@@ -275,8 +275,7 @@ def _real_extract(self, url):
             self._search_regex(r'jwsetup\s*=\s*({(?:.|\n)[^;]+});', webpage, 'player config'),
             video_id, transform_source=js_to_json)
 
-        title = (self._og_search_title(webpage, default=None)
-                 or self._html_extract_title(webpage, 'video title'))
+        title = self._generic_title('', webpage)
         description = (self._og_search_description(webpage, default=None)
                        or self._html_search_meta('description', webpage, 'description', default=None))
 
index 448c332b3dc6e05fd0b1758380b02d394a4cce83..1f48cfd363cd6c7fe1a3b0d3eeaaf3adc548ef21 100644 (file)
@@ -71,7 +71,7 @@ def _real_extract(self, url):
              r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'],
             webpage, 'video url')
 
-        title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage)
+        title = self._generic_title('', webpage)
         duration = int_or_none(self._og_search_property(
             'video:duration', webpage, 'duration', default=None))
 
index 5abde33a91dd7976ab6f93a0c7f47896862e6044..b0b26b61ad4f08d8d127b83f6337668c1580c2df 100644 (file)
@@ -2740,8 +2740,7 @@ def _real_extract(self, url):
             #   Site Name | Video Title
             #   Video Title - Tagline | Site Name
             # and so on and so forth; it's just not practical
-            'title': (self._og_search_title(webpage, default=None)
-                      or self._html_extract_title(webpage, 'video title', default='video')),
+            'title': self._generic_title('', webpage, default='video'),
             'description': self._og_search_description(webpage, default=None),
             'thumbnail': self._og_search_thumbnail(webpage, default=None),
             'age_limit': self._rta_search(webpage),
index 1bffe275a87186f15ddb0d53b5dab5d97538976b..45e1618ba0835bbb4c83a6fd674fc3422bb2f663 100644 (file)
@@ -20,7 +20,7 @@ class HTML5MediaEmbedIE(InfoExtractor):
     ]
 
     def _extract_from_webpage(self, url, webpage):
-        video_id, title = self._generic_id(url), self._generic_title(url)
+        video_id, title = self._generic_id(url), self._generic_title(url, webpage)
         entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') or []
         for num, entry in enumerate(entries, start=1):
             entry.update({
index 2bffb26dc29a275c877b1de20d500bc28bc6aa88..d114f3494cc5c6cb3b672a65977b7a19c09c08c6 100644 (file)
@@ -20,7 +20,7 @@ def _real_extract(self, url):
 
         webpage = self._download_webpage(url, video_id)
 
-        title = self._html_extract_title(webpage, default=None) or self._og_search_title(webpage)
+        title = self._generic_title('', webpage)
         video_url = self._proto_relative_url(self._search_regex(
             r'<source[^>]+src=(["\'])(?P<url>.+?)\1',
             webpage, 'video URL', default=None,
index 95b6dfe52edc097bb8f597cb00bdb4b87099ffff..1a6f3cd7486a6cc56f735a4d4b296b9e0ce4f5ad 100644 (file)
@@ -48,9 +48,7 @@ def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        title = self._og_search_title(
-            webpage, default=None) or self._html_search_regex(
-            r'<title[^>]*>([^<]+)</title>', webpage, 'title')
+        title = self._generic_title('', webpage)
 
         formats = []
 
index 60d76d1b118c4f1f42c1b160ff5fd0c38c6b9fdc..517660ef107270cf796959e12f28fce98ce44b57 100644 (file)
@@ -321,8 +321,7 @@ def _real_extract(self, url):
 
         webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id)
 
-        title = (self._og_search_title(webpage)
-                 or self._html_extract_title(webpage)
+        title = (self._generic_title('', webpage)
                  or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False))
         title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None
         description = self._html_search_regex(
index 59d4490d0f02e3a15fb277b58efb7a40117185aa..a46211e77728b4d5d64a0be5bd30186744cd7ebe 100644 (file)
@@ -106,7 +106,6 @@ def _real_extract(self, url):
 
         playlist_title = (
             traverse_obj(fusion_metadata, ('headlines', 'basic'))
-            or self._og_search_title(webpage)
-            or self._html_extract_title(webpage)
+            or self._generic_title('', webpage)
         )
         return self.playlist_result(entries, display_id, playlist_title)
index e15c22f2a73247bd5bd424bbe331348f9aa2ac42..eea20ff855a4c8e14af5b56e1e808ebd417044cc 100644 (file)
@@ -166,7 +166,7 @@ def _real_extract(self, url):
         self._sort_formats(formats)
         return {
             'id': video_id,
-            'title': self._html_extract_title(webpage) or self._og_search_title(webpage),
+            'title': self._generic_title('', webpage),
             'formats': formats,
             'live_status': 'is_live',
             'view_count': json_data.get('num_view'),
index 5baa21d52a0c5fc5ccd8b58d3141a269ddc12a80..47cb0965e0c9236f65a31a7c6e40aa945e08a7ed 100644 (file)
@@ -142,7 +142,7 @@ def _real_extract(self, url):
 
         return {
             'id': video_id,
-            'title': self._html_extract_title(webpage) or self._og_search_title(webpage),
+            'title': self._generic_title('', webpage),
             'description': self._html_search_regex(
                 (r'<span itemprop="description" content=["\']([^"\']+)["\']>', *self._og_regexes('description')),
                 webpage, 'description', fatal=False),
index 2f2571df76a145afe636741fbd29a6d2194d368f..8d2475296f8e1c45b9fc62abdd8263f677883162 100644 (file)
@@ -74,6 +74,6 @@ def _real_extract(self, url):
             'formats': formats,
             'subtitles': subtitles,
             'thumbnail': thumbnail or self._og_search_thumbnail(webpage),
-            'title': self._html_extract_title(webpage) or self._og_search_title(webpage),
+            'title': self._generic_title('', webpage),
             'description': self._og_search_description(webpage, default=None),
         }