[extractor/rutube] Extract chapters from description (#6345)

author mushbite <redacted>

Sat, 4 Mar 2023 13:33:17 +0000 (15:33 +0200)

committer GitHub <redacted>

Sat, 4 Mar 2023 13:33:17 +0000 (19:03 +0530)
author mushbite <redacted>
Sat, 4 Mar 2023 13:33:17 +0000 (15:33 +0200)
committer GitHub <redacted>
Sat, 4 Mar 2023 13:33:17 +0000 (19:03 +0530)
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py

index 8ad63b4118fddd25a35f59764d0f659b4fa494c2..2091df7fafce774df9a37cbca253ae9428b40ef7 100644 (file)
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -3649,6 +3649,38 @@ def _generic_title(self, url='', webpage='', *, default=None):
                  or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
                  or default)
  
+    def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):  # build validated chapter dicts from raw items via the two callbacks
+        if not duration:
+            return  # falsy duration: bail out with None before building anything
+        chapter_list = [{
+            'start_time': start_function(chapter),
+            'title': title_function(chapter),
+        } for chapter in chapter_list or []]  # `or []` tolerates a None/empty input list
+        if not strict:
+            chapter_list.sort(key=lambda c: c['start_time'] or 0)  # non-strict: reorder by start time, None sorts as 0
+
+        chapters = [{'start_time': 0}]  # sentinel so the first comparison has a predecessor; sliced off on return
+        for idx, chapter in enumerate(chapter_list):
+            if chapter['start_time'] is None:
+                self.report_warning(f'Incomplete chapter {idx}')  # skipped: start_function returned None
+            elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
+                chapters.append(chapter)  # not earlier than the last accepted chapter and not past duration
+            elif chapter not in chapters:  # an equal dict that was already accepted is dropped silently
+                self.report_warning(
+                    f'Invalid start time ({chapter["start_time"]} < {chapters[-1]["start_time"]}) for chapter "{chapter["title"]}"')  # NOTE(review): also reached when start_time > duration, where the "<" wording does not hold
+        return chapters[1:]
+
+    def _extract_chapters_from_description(self, description, duration):  # parse "timestamp title" / "title timestamp" lines of a description into chapters
+        duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'  # M:SS or MM:SS, with an optional leading `digits:` group
+        sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'  # whole line: two captured fields split by non-word chars ending in one whitespace
+        return self._extract_chapters_helper(
+            re.findall(sep_re % (duration_re, r'.+?'), description or ''),  # first attempt: timestamp first, title second
+            start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
+            duration=duration, strict=False) or self._extract_chapters_helper(  # falsy result (None or []) falls through to the reversed layout
+            re.findall(sep_re % (r'.+?', duration_re), description or ''),  # second attempt: title first, timestamp second
+            start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
+            duration=duration, strict=False)
+
      @staticmethod
      def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
          all_known = all(map(
diff --git a/yt_dlp/extractor/rutube.py b/yt_dlp/extractor/rutube.py

index 97e6354b4222078e33213dc6d78152faf8e0eaf7..08d9b9257dab191238276391482052d848734166 100644 (file)
--- a/yt_dlp/extractor/rutube.py
+++ b/yt_dlp/extractor/rutube.py
@@ -25,8 +25,7 @@ def _download_api_info(self, video_id, query=None):
              video_id, 'Downloading video JSON',
              'Unable to download video JSON', query=query)
  
-    @staticmethod
-    def _extract_info(video, video_id=None, require_title=True):
+    def _extract_info(self, video, video_id=None, require_title=True):
          title = video['title'] if require_title else video.get('title')
  
          age_limit = video.get('is_adult')
@@ -35,13 +34,15 @@ def _extract_info(video, video_id=None, require_title=True):
  
          uploader_id = try_get(video, lambda x: x['author']['id'])
          category = try_get(video, lambda x: x['category']['name'])
+        description = video.get('description')
+        duration = int_or_none(video.get('duration'))
  
          return {
              'id': video.get('id') or video_id if video_id else video['id'],
              'title': title,
-            'description': video.get('description'),
+            'description': description,
              'thumbnail': video.get('thumbnail_url'),
-            'duration': int_or_none(video.get('duration')),
+            'duration': duration,
              'uploader': try_get(video, lambda x: x['author']['name']),
              'uploader_id': compat_str(uploader_id) if uploader_id else None,
              'timestamp': unified_timestamp(video.get('created_ts')),
@@ -50,6 +51,7 @@ def _extract_info(video, video_id=None, require_title=True):
              'view_count': int_or_none(video.get('hits')),
              'comment_count': int_or_none(video.get('comments_count')),
              'is_live': bool_or_none(video.get('is_livestream')),
+            'chapters': self._extract_chapters_from_description(description, duration),
          }
  
      def _download_and_extract_info(self, video_id, query=None):
@@ -111,8 +113,9 @@ class RutubeIE(RutubeBaseIE):
              'view_count': int,
              'thumbnail': 'http://pic.rutubelist.ru/video/d2/a0/d2a0aec998494a396deafc7ba2c82add.jpg',
              'category': ['Новости и СМИ'],
-
+            'chapters': [],
          },
+        'expected_warnings': ['Unable to download f4m'],
      }, {
          'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661',
          'only_matching': True,
@@ -142,7 +145,28 @@ class RutubeIE(RutubeBaseIE):
              'view_count': int,
              'thumbnail': 'http://pic.rutubelist.ru/video/f2/d4/f2d42b54be0a6e69c1c22539e3152156.jpg',
              'category': ['Видеоигры'],
+            'chapters': [],
+        },
+        'expected_warnings': ['Unable to download f4m'],
+    }, {
+        'url': 'https://rutube.ru/video/c65b465ad0c98c89f3b25cb03dcc87c6/',
+        'info_dict': {
+            'id': 'c65b465ad0c98c89f3b25cb03dcc87c6',
+            'ext': 'mp4',
+            'chapters': 'count:4',
+            'category': ['Бизнес и предпринимательство'],
+            'description': 'md5:252feac1305257d8c1bab215cedde75d',
+            'thumbnail': 'http://pic.rutubelist.ru/video/71/8f/718f27425ea9706073eb80883dd3787b.png',
+            'duration': 782,
+            'age_limit': 0,
+            'uploader_id': '23491359',
+            'timestamp': 1677153329,
+            'view_count': int,
+            'upload_date': '20230223',
+            'title': 'Бизнес с нуля: найм сотрудников. Интервью с директором строительной компании',
+            'uploader': 'Стас Быков',
          },
+        'expected_warnings': ['Unable to download f4m'],
      }]
  
      @classmethod
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py

index b02e0153af76d00133e709a6b6d7929635f31bd3..b8bb980f33986b76a649a38913af926e3d6a8ac0 100644 (file)
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -3205,11 +3205,11 @@ def _extract_chapters_from_json(self, data, duration):
                  'decoratedPlayerBarRenderer', 'playerBar', 'chapteredPlayerBarRenderer', 'chapters'
              ), expected_type=list)
  
-        return self._extract_chapters(
+        return self._extract_chapters_helper(
              chapter_list,
-            chapter_time=lambda chapter: float_or_none(
+            start_function=lambda chapter: float_or_none(
                  traverse_obj(chapter, ('chapterRenderer', 'timeRangeStartMillis')), scale=1000),
-            chapter_title=lambda chapter: traverse_obj(
+            title_function=lambda chapter: traverse_obj(
                  chapter, ('chapterRenderer', 'title', 'simpleText'), expected_type=str),
              duration=duration)
  
@@ -3222,42 +3222,10 @@ def _extract_chapters_from_engagement_panel(self, data, duration):
          chapter_title = lambda chapter: self._get_text(chapter, 'title')
  
          return next(filter(None, (
-            self._extract_chapters(traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
-                                   chapter_time, chapter_title, duration)
+            self._extract_chapters_helper(traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
+                                          chapter_time, chapter_title, duration)
              for contents in content_list)), [])
  
-    def _extract_chapters_from_description(self, description, duration):
-        duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
-        sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
-        return self._extract_chapters(
-            re.findall(sep_re % (duration_re, r'.+?'), description or ''),
-            chapter_time=lambda x: parse_duration(x[0]), chapter_title=lambda x: x[1],
-            duration=duration, strict=False) or self._extract_chapters(
-            re.findall(sep_re % (r'.+?', duration_re), description or ''),
-            chapter_time=lambda x: parse_duration(x[1]), chapter_title=lambda x: x[0],
-            duration=duration, strict=False)
-
-    def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration, strict=True):
-        if not duration:
-            return
-        chapter_list = [{
-            'start_time': chapter_time(chapter),
-            'title': chapter_title(chapter),
-        } for chapter in chapter_list or []]
-        if not strict:
-            chapter_list.sort(key=lambda c: c['start_time'] or 0)
-
-        chapters = [{'start_time': 0}]
-        for idx, chapter in enumerate(chapter_list):
-            if chapter['start_time'] is None:
-                self.report_warning(f'Incomplete chapter {idx}')
-            elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
-                chapters.append(chapter)
-            elif chapter not in chapters:
-                self.report_warning(
-                    f'Invalid start time ({chapter["start_time"]} < {chapters[-1]["start_time"]}) for chapter "{chapter["title"]}"')
-        return chapters[1:]
-
      def _extract_comment(self, comment_renderer, parent=None):
          comment_id = comment_renderer.get('commentId')
          if not comment_id:
author	mushbite <redacted>
	Sat, 4 Mar 2023 13:33:17 +0000 (15:33 +0200)
committer	GitHub <redacted>
	Sat, 4 Mar 2023 13:33:17 +0000 (19:03 +0530)
yt_dlp/extractor/common.py		patch | blob | blame | history
yt_dlp/extractor/rutube.py		patch | blob | blame | history
yt_dlp/extractor/youtube.py		patch | blob | blame | history