[extractor] Handle `json_ld` with multiple `@type`s

author pukkandan <redacted>

Mon, 13 Jun 2022 13:09:58 +0000 (18:39 +0530)

committer pukkandan <redacted>

Mon, 13 Jun 2022 13:42:34 +0000 (19:12 +0530)
author pukkandan <redacted>
Mon, 13 Jun 2022 13:09:58 +0000 (18:39 +0530)
committer pukkandan <redacted>
Mon, 13 Jun 2022 13:42:34 +0000 (19:12 +0530)
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py

index 1cf8581b6c73286d9d23faca8f06d0d4323c1102..7adabf6f973fce21e2aacb24e19a5d34738aefe9 100644 (file)
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1419,6 +1419,10 @@ def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
              'ViewAction': 'view',
          }
  
+        def is_type(e, *expected_types):
+            type = variadic(traverse_obj(e, '@type'))
+            return any(x in type for x in expected_types)
+
          def extract_interaction_type(e):
              interaction_type = e.get('interactionType')
              if isinstance(interaction_type, dict):
@@ -1432,9 +1436,7 @@ def extract_interaction_statistic(e):
              if not isinstance(interaction_statistic, list):
                  return
              for is_e in interaction_statistic:
-                if not isinstance(is_e, dict):
-                    continue
-                if is_e.get('@type') != 'InteractionCounter':
+                if not is_type(is_e, 'InteractionCounter'):
                      continue
                  interaction_type = extract_interaction_type(is_e)
                  if not interaction_type:
@@ -1471,7 +1473,7 @@ def extract_chapter_information(e):
                  info['chapters'] = chapters
  
          def extract_video_object(e):
-            assert e['@type'] == 'VideoObject'
+            assert is_type(e, 'VideoObject')
              author = e.get('author')
              info.update({
                  'url': traverse_obj(e, 'contentUrl', 'embedUrl', expected_type=url_or_none),
@@ -1503,13 +1505,12 @@ def traverse_json_ld(json_ld, at_top_level=True):
                  if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                      traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
                      break
-                item_type = e.get('@type')
-                if expected_type is not None and expected_type != item_type:
+                if expected_type is not None and not is_type(e, expected_type):
                      continue
                  rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                  if rating is not None:
                      info['average_rating'] = rating
-                if item_type in ('TVEpisode', 'Episode'):
+                if is_type(e, 'TVEpisode', 'Episode'):
                      episode_name = unescapeHTML(e.get('name'))
                      info.update({
                          'episode': episode_name,
@@ -1519,39 +1520,39 @@ def traverse_json_ld(json_ld, at_top_level=True):
                      if not info.get('title') and episode_name:
                          info['title'] = episode_name
                      part_of_season = e.get('partOfSeason')
-                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
+                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                          info.update({
                              'season': unescapeHTML(part_of_season.get('name')),
                              'season_number': int_or_none(part_of_season.get('seasonNumber')),
                          })
                      part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
-                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
+                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                          info['series'] = unescapeHTML(part_of_series.get('name'))
-                elif item_type == 'Movie':
+                elif is_type(e, 'Movie'):
                      info.update({
                          'title': unescapeHTML(e.get('name')),
                          'description': unescapeHTML(e.get('description')),
                          'duration': parse_duration(e.get('duration')),
                          'timestamp': unified_timestamp(e.get('dateCreated')),
                      })
-                elif item_type in ('Article', 'NewsArticle'):
+                elif is_type(e, 'Article', 'NewsArticle'):
                      info.update({
                          'timestamp': parse_iso8601(e.get('datePublished')),
                          'title': unescapeHTML(e.get('headline')),
                          'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                      })
-                    if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
+                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                          extract_video_object(e['video'][0])
-                    elif traverse_obj(e, ('subjectOf', 0, '@type')) == 'VideoObject':
+                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                          extract_video_object(e['subjectOf'][0])
-                elif item_type == 'VideoObject':
+                elif is_type(e, 'VideoObject'):
                      extract_video_object(e)
                      if expected_type is None:
                          continue
                      else:
                          break
                  video = e.get('video')
-                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
+                if is_type(video, 'VideoObject'):
                      extract_video_object(video)
                  if expected_type is None:
                      continue
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py

index 1fcb0a53bf4f963ff496917592edef39278a1d4b..e1bf838d2e21d3ad00045f4208ea7e42443750a0 100644 (file)
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -2596,8 +2596,23 @@ class GenericIE(InfoExtractor):
                  'uploader': 'Mr Producer Media',
                  'upload_date': '20220610',
              }
-
-        }
+        },
+        {
+            'note': 'JSON LD with multiple @type',
+            'url': 'https://www.nu.nl/280161/video/hoe-een-bladvlo-dit-verwoestende-japanse-onkruid-moet-vernietigen.html',
+            'md5': 'c7949f34f57273013fb7ccb1156393db',
+            'info_dict': {
+                'id': 'ipy2AcGL',
+                'ext': 'mp4',
+                'description': 'md5:6a9d644bab0dc2dc06849c2505d8383d',
+                'thumbnail': r're:https://media\.nu\.nl/m/.+\.jpg',
+                'title': 'Hoe een bladvlo dit verwoestende Japanse onkruid moet vernietigen',
+                'timestamp': 1586577474,
+                'upload_date': '20200411',
+                'age_limit': 0,
+                'duration': 111.0,
+            }
+        },
      ]
  
      def report_following_redirect(self, new_url):
diff --git a/yt_dlp/extractor/jwplatform.py b/yt_dlp/extractor/jwplatform.py

index 1bf58d517a7404db5b2c2613ab10638f049f6d22..2cb7ca3d7a6013b98027be6b345e19da566fe30a 100644 (file)
--- a/yt_dlp/extractor/jwplatform.py
+++ b/yt_dlp/extractor/jwplatform.py
@@ -5,7 +5,7 @@
  
  
  class JWPlatformIE(InfoExtractor):
-    _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
+    _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview|manifest)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
      _TESTS = [{
          'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js',
          'md5': 'fa8899fa601eb7c83a64e9d568bdf325',
author	pukkandan <redacted>
	Mon, 13 Jun 2022 13:09:58 +0000 (18:39 +0530)
committer	pukkandan <redacted>
	Mon, 13 Jun 2022 13:42:34 +0000 (19:12 +0530)
yt_dlp/extractor/common.py		patch | blob | blame | history
yt_dlp/extractor/generic.py		patch | blob | blame | history
yt_dlp/extractor/jwplatform.py		patch | blob | blame | history