jfr.im git - yt-dlp.git/commitdiff
[ie/theplatform] Extract more metadata (#8635)
author trainman261 <redacted>
Tue, 12 Dec 2023 00:00:35 +0000 (01:00 +0100)
committer GitHub <redacted>
Tue, 12 Dec 2023 00:00:35 +0000 (00:00 +0000)
Authored by: trainman261

yt_dlp/extractor/aenetworks.py
yt_dlp/extractor/cbc.py
yt_dlp/extractor/cwtv.py
yt_dlp/extractor/mediaset.py
yt_dlp/extractor/nbc.py
yt_dlp/extractor/scrippsnetworks.py
yt_dlp/extractor/theplatform.py

index cc26653c1deee0e684a945a237ab84df8dae8b96..63a0532ef113f5dcb7ad3ac0ef1da0bd12c86d23 100644 (file)
@@ -121,11 +121,21 @@ class AENetworksIE(AENetworksBaseIE):
         'info_dict': {
             'id': '22253814',
             'ext': 'mp4',
-            'title': 'Winter is Coming',
-            'description': 'md5:641f424b7a19d8e24f26dea22cf59d74',
+            'title': 'Winter Is Coming',
+            'description': 'md5:a40e370925074260b1c8a633c632c63a',
             'timestamp': 1338306241,
             'upload_date': '20120529',
             'uploader': 'AENE-NEW',
+            'duration': 2592.0,
+            'thumbnail': r're:^https?://.*\.jpe?g$',
+            'chapters': 'count:5',
+            'tags': 'count:14',
+            'categories': ['Mountain Men'],
+            'episode_number': 1,
+            'episode': 'Episode 1',
+            'season': 'Season 1',
+            'season_number': 1,
+            'series': 'Mountain Men',
         },
         'params': {
             # m3u8 download
@@ -143,6 +153,15 @@ class AENetworksIE(AENetworksBaseIE):
             'timestamp': 1452634428,
             'upload_date': '20160112',
             'uploader': 'AENE-NEW',
+            'duration': 1277.695,
+            'thumbnail': r're:^https?://.*\.jpe?g$',
+            'chapters': 'count:4',
+            'tags': 'count:23',
+            'episode': 'Episode 1',
+            'episode_number': 1,
+            'season': 'Season 9',
+            'season_number': 9,
+            'series': 'Duck Dynasty',
         },
         'params': {
             # m3u8 download
index 29f0e307d132908255a735abcbf5da485fbdabd0..b5beb1ec8cdc01b5cce5a6e37f04c60c118469e9 100644 (file)
@@ -180,6 +180,13 @@ class CBCPlayerIE(InfoExtractor):
             'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg',
             'chapters': [],
             'duration': 494.811,
+            'categories': ['AudioMobile/All in a Weekend Montreal'],
+            'tags': 'count:8',
+            'location': 'Quebec',
+            'series': 'All in a Weekend Montreal',
+            'season': 'Season 2015',
+            'season_number': 2015,
+            'media_type': 'Excerpt',
         },
     }, {
         'url': 'http://www.cbc.ca/player/play/2164402062',
@@ -195,25 +202,37 @@ class CBCPlayerIE(InfoExtractor):
             'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg',
             'chapters': [],
             'duration': 186.867,
+            'series': 'CBC News: Windsor at 6:00',
+            'categories': ['News/Canada/Windsor'],
+            'location': 'Windsor',
+            'tags': ['cancer'],
+            'creator': 'Allison Johnson',
+            'media_type': 'Excerpt',
         },
     }, {
         # Has subtitles
         # These broadcasts expire after ~1 month, can find new test URL here:
         # https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast
-        'url': 'http://www.cbc.ca/player/play/2249992771553',
-        'md5': '2f2fb675dd4f0f8a5bb7588d1b13bacd',
+        'url': 'http://www.cbc.ca/player/play/2284799043667',
+        'md5': '9b49f0839e88b6ec0b01d840cf3d42b5',
         'info_dict': {
-            'id': '2249992771553',
+            'id': '2284799043667',
             'ext': 'mp4',
-            'title': 'The National | Women’s soccer pay, Florida seawater, Swift quake',
-            'description': 'md5:adba28011a56cfa47a080ff198dad27a',
-            'timestamp': 1690596000,
-            'duration': 2716.333,
+            'title': 'The National | Hockey coach charged, Green grants, Safer drugs',
+            'description': 'md5:84ef46321c94bcf7d0159bb565d26bfa',
+            'timestamp': 1700272800,
+            'duration': 2718.833,
             'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]},
-            'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/481/326/thumbnail.jpeg',
+            'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/907/171/thumbnail.jpeg',
             'uploader': 'CBCC-NEW',
             'chapters': 'count:5',
-            'upload_date': '20230729',
+            'upload_date': '20231118',
+            'categories': 'count:4',
+            'series': 'The National - Full Show',
+            'tags': 'count:1',
+            'creator': 'News',
+            'location': 'Canada',
+            'media_type': 'Full Program',
         },
     }]
 
index 9b83264ee1617ee445e8f1381bec18d2e8207abf..69d50daf6c4e6712b44e01c76a6dead2df4affa9 100644 (file)
@@ -46,6 +46,10 @@ class CWTVIE(InfoExtractor):
             'timestamp': 1444107300,
             'age_limit': 14,
             'uploader': 'CWTV',
+            'thumbnail': r're:^https?://.*\.jpe?g$',
+            'chapters': 'count:4',
+            'episode': 'Episode 20',
+            'season': 'Season 11',
         },
         'params': {
             # m3u8 download
index 2d620429827248ed588c5c676455bd7ef523d9f8..e04a1ce9017140ee9add71a878b6e80c1c940dd6 100644 (file)
@@ -73,6 +73,7 @@ class MediasetIE(ThePlatformBaseIE):
             'season_number': 5,
             'episode_number': 5,
             'chapters': [{'start_time': 0.0, 'end_time': 3409.08}, {'start_time': 3409.08, 'end_time': 6565.008}],
+            'categories': ['Informazione'],
         },
     }, {
         # DRM
@@ -149,6 +150,7 @@ class MediasetIE(ThePlatformBaseIE):
             'season_number': 12,
             'episode': 'Episode 8',
             'episode_number': 8,
+            'categories': ['Intrattenimento'],
         },
         'params': {
             'skip_download': True,
index 2d3aa26ec9a913fa866fb8fbd5249785ab6c5d2a..267fa8353231aebbe8c0e8dbceae060a45a0d547 100644 (file)
@@ -53,6 +53,8 @@ class NBCIE(ThePlatformIE):  # XXX: Do not subclass from concrete IE
                 'chapters': 'count:1',
                 'tags': 'count:4',
                 'thumbnail': r're:https?://.+\.jpg',
+                'categories': ['Series/The Tonight Show Starring Jimmy Fallon'],
+                'media_type': 'Full Episode',
             },
             'params': {
                 'skip_download': 'm3u8',
@@ -131,6 +133,8 @@ class NBCIE(ThePlatformIE):  # XXX: Do not subclass from concrete IE
                 'tags': 'count:10',
                 'age_limit': 0,
                 'thumbnail': r're:https?://.+\.jpg',
+                'categories': ['Series/Quantum Leap 2022'],
+                'media_type': 'Highlight',
             },
             'params': {
                 'skip_download': 'm3u8',
index 7f0bc9645610bf62a1087f294416443156470949..3912f778658e1583b8d41974ef97aafe70d83697 100644 (file)
@@ -114,6 +114,8 @@ class ScrippsNetworksIE(InfoExtractor):
             'timestamp': 1475678834,
             'upload_date': '20161005',
             'uploader': 'SCNI-SCND',
+            'tags': 'count:10',
+            'creator': 'Cooking Channel',
             'duration': 29.995,
             'chapters': [{'start_time': 0.0, 'end_time': 29.995, 'title': '<Untitled Chapter 1>'}],
             'thumbnail': 'https://images.dds.discovery.com/up/tp/Scripps_-_Food_Category_Prod/122/987/0260338_630x355.jpg',
index 433ce8427c5dec84fda2447b841c2026e4f0a6c1..9160f5ec6b8e95932c86383f09dcee9aa784fba2 100644 (file)
@@ -104,6 +104,10 @@ def _add_chapter(start_time, end_time):
                 _add_chapter(chapter.get('startTime'), chapter.get('endTime'))
             _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration)
 
+        def extract_site_specific_field(field):
+            # A number of sites have custom-prefixed keys, e.g. 'cbc$seasonNumber'
+            return traverse_obj(info, lambda k, v: v and k.endswith(f'${field}'), get_all=False)
+
         return {
             'title': info['title'],
             'subtitles': subtitles,
@@ -113,6 +117,14 @@ def _add_chapter(start_time, end_time):
             'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
             'uploader': info.get('billingCode'),
             'chapters': chapters,
+            'creator': traverse_obj(info, ('author', {str})) or None,
+            'categories': traverse_obj(info, (
+                'categories', lambda _, v: v.get('label') in ('category', None), 'name', {str})) or None,
+            'tags': traverse_obj(info, ('keywords', {lambda x: re.split(r'[;,]\s?', x) if x else None})),
+            'location': extract_site_specific_field('region'),
+            'series': extract_site_specific_field('show'),
+            'season_number': int_or_none(extract_site_specific_field('seasonNumber')),
+            'media_type': extract_site_specific_field('programmingType') or extract_site_specific_field('type'),
         }
 
     def _extract_theplatform_metadata(self, path, video_id):