jfr.im git - yt-dlp.git/blobdiff - yt_dlp/extractor/glomex.py
[ie/matchtv] Fix extractor (#10190)
[yt-dlp.git] / yt_dlp / extractor / glomex.py
index 247a65a79620241e7b42e108c55b66ff05ad4701..35ffad56c2c56ef18f8d295a697e7c9b63ea345f 100644 (file)
@@ -1,13 +1,11 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import re
 import urllib.parse
 
 from .common import InfoExtractor
 from ..utils import (
-    determine_ext,
     ExtractorError,
+    determine_ext,
+    extract_attributes,
     int_or_none,
     parse_qs,
     smuggle_url,
@@ -51,15 +49,15 @@ def _download_api_data(self, video_id, integration, current_url=None):
         video_id_type = self._get_videoid_type(video_id)
         return self._download_json(
             self._API_URL,
-            video_id, 'Downloading %s JSON' % video_id_type,
-            'Unable to download %s JSON' % video_id_type,
+            video_id, f'Downloading {video_id_type} JSON',
+            f'Unable to download {video_id_type} JSON',
             query=query)
 
     def _download_and_extract_api_data(self, video_id, integration, current_url):
+        """Download the API data for *video_id* and build the extraction result.
+
+        Returns the single extracted entry when the API lists exactly one
+        video, otherwise a playlist result holding every extracted entry.
+        Raises ExtractorError when the API response lists no videos.
+        """
         api_data = self._download_api_data(video_id, integration, current_url)
         videos = api_data['videos']
         if not videos:
-            raise ExtractorError('no videos found for %s' % video_id)
+            raise ExtractorError(f'no videos found for {video_id}')
         videos = [self._extract_api_data(video, video_id) for video in videos]
         return videos[0] if len(videos) == 1 else self.playlist_result(videos, video_id)
 
@@ -75,7 +73,7 @@ def _extract_api_data(self, video, video_id):
                     format_url, video_id, 'mp4', m3u8_id=format_id,
                     fatal=False)
                 formats.extend(formats_)
-                subs.update(subs_)
+                self._merge_subtitles(subs_, target=subs)
             else:
                 formats.append({
                     'url': format_url,
@@ -84,7 +82,6 @@ def _extract_api_data(self, video, video_id):
         if video.get('language'):
             for fmt in formats:
                 fmt['language'] = video['language']
-        self._sort_formats(formats)
 
         images = (video.get('images') or []) + [video.get('image') or {}]
         thumbnails = [{
@@ -176,50 +173,39 @@ def build_player_url(cls, video_id, integration, origin_url=None):
         return cls._smuggle_origin_url(f'https:{cls._BASE_PLAYER_URL}?{query_string}', origin_url)
 
     @classmethod
-    def _extract_urls(cls, webpage, origin_url):
-        # in comparison with _VALID_URL:
-        # * make the scheme optional
-        # * simplify the query string part; after extracting iframe src, the URL will be matched again
-        VALID_SRC = rf'(?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+'
-
+    def _extract_embed_urls(cls, url, webpage):
         # https://docs.glomex.com/publisher/video-player-integration/javascript-api/
-        EMBED_RE = r'''(?x)(?:
-            <iframe[^>]+?src=(?P<_q1>%(quot_re)s)(?P<url>%(url_re)s)(?P=_q1)|
-            <(?P<html_tag>glomex-player|div)(?:
-                data-integration-id=(?P<_q2>%(quot_re)s)(?P<integration_html>(?:(?!(?P=_q2)).)+)(?P=_q2)|
-                data-playlist-id=(?P<_q3>%(quot_re)s)(?P<id_html>(?:(?!(?P=_q3)).)+)(?P=_q3)|
-                data-glomex-player=(?P<_q4>%(quot_re)s)(?P<glomex_player>true)(?P=_q4)|
-                [^>]*?
-            )+>|
-            # naive parsing of inline scripts for hard-coded integration parameters
-            <(?P<script_tag>script)[^<]*?>(?:
-                (?P<_stjs1>dataset\.)?integrationId\s*(?(_stjs1)=|:)\s*
-                    (?P<_q5>%(quot_re)s)(?P<integration_js>(?:(?!(?P=_q5)).)+)(?P=_q5)\s*(?(_stjs1);|,)?|
-                (?P<_stjs2>dataset\.)?playlistId\s*(?(_stjs2)=|:)\s*
-                    (?P<_q6>%(quot_re)s)(?P<id_js>(?:(?!(?P=_q6)).)+)(?P=_q6)\s*(?(_stjs2);|,)?|
-                (?:\s|.)*?
-            )+</script>
-        )''' % {'quot_re': r'["\']', 'url_re': VALID_SRC}
-
-        for mobj in re.finditer(EMBED_RE, webpage):
-            mdict = mobj.groupdict()
-            if mdict.get('url'):
-                url = unescapeHTML(mdict['url'])
-                if url.startswith('//'):
-                    url = f'https:{url}'
-                if not cls.suitable(url):
-                    continue
-                yield cls._smuggle_origin_url(url, origin_url)
-            elif mdict.get('html_tag'):
-                if mdict['html_tag'] == 'div' and not mdict.get('glomex_player'):
-                    continue
-                if not mdict.get('video_id_html') or not mdict.get('integration_html'):
-                    continue
-                yield cls.build_player_url(mdict['video_id_html'], mdict['integration_html'], origin_url)
-            elif mdict.get('script_tag'):
-                if not mdict.get('video_id_js') or not mdict.get('integration_js'):
-                    continue
-                yield cls.build_player_url(mdict['video_id_js'], mdict['integration_js'], origin_url)
+        quot_re = r'["\']'
+
+        regex = fr'''(?x)
+            <iframe[^>]+?src=(?P<q>{quot_re})(?P<url>
+                (?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=q)).)+
+            )(?P=q)'''
+        for mobj in re.finditer(regex, webpage):
+            embed_url = unescapeHTML(mobj.group('url'))
+            if cls.suitable(embed_url):
+                yield cls._smuggle_origin_url(embed_url, url)
+
+        regex = fr'''(?x)
+            <glomex-player [^>]+?>|
+            <div[^>]* data-glomex-player=(?P<q>{quot_re})true(?P=q)[^>]*>'''
+        for mobj in re.finditer(regex, webpage):
+            attrs = extract_attributes(mobj.group(0))
+            if attrs.get('data-integration-id') and attrs.get('data-playlist-id'):
+                yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], url)
+
+        # naive parsing of inline scripts for hard-coded integration parameters
+        regex = fr'''(?x)
+            (?P<is_js>dataset\.)?%s\s*(?(is_js)=|:)\s*
+            (?P<q>{quot_re})(?P<id>(?:(?!(?P=q)).)+)(?P=q)\s'''
+        for mobj in re.finditer(r'(?x)<script[^<]*>.+?</script>', webpage):
+            script = mobj.group(0)
+            integration_id = re.search(regex % 'integrationId', script)
+            if not integration_id:
+                continue
+            playlist_id = re.search(regex % 'playlistId', script)
+            if playlist_id:
+                yield cls.build_player_url(playlist_id, integration_id, url)
 
     def _real_extract(self, url):
         url, origin_url = self._unsmuggle_origin_url(url)