[ie/matchtv] Fix extractor (#10190)

[yt-dlp.git] / yt_dlp / extractor / gdcvault.py
diff --git a/yt_dlp/extractor/gdcvault.py b/yt_dlp/extractor/gdcvault.py

index a248a170db5959844f127d724915ef588a7fda39..5d45240650c55293c8a43694b1b770df047f5d87 100644 (file)
--- a/yt_dlp/extractor/gdcvault.py
+++ b/yt_dlp/extractor/gdcvault.py
@@ -1,16 +1,13 @@
-from __future__ import unicode_literals
-
  import re
  
  from .common import InfoExtractor
  from .kaltura import KalturaIE
-from ..utils import (
-    sanitized_Request,
-    urlencode_postdata,
-)
+from ..networking import HEADRequest, Request
+from ..utils import remove_start, smuggle_url, urlencode_postdata
  
  
  class GDCVaultIE(InfoExtractor):
+    _WORKING = False
      _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)(?:/(?P<name>[\w-]+))?'
      _NETRC_MACHINE = 'gdcvault'
      _TESTS = [
@@ -21,8 +18,8 @@ class GDCVaultIE(InfoExtractor):
                  'id': '201311826596_AWNY',
                  'display_id': 'Doki-Doki-Universe-Sweet-Simple',
                  'ext': 'mp4',
-                'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)'
-            }
+                'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)',
+            },
          },
          {
              'url': 'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of',
@@ -30,11 +27,11 @@ class GDCVaultIE(InfoExtractor):
                  'id': '201203272_1330951438328RSXR',
                  'display_id': 'Embracing-the-Dark-Art-of',
                  'ext': 'flv',
-                'title': 'Embracing the Dark Art of Mathematical Modeling in AI'
+                'title': 'Embracing the Dark Art of Mathematical Modeling in AI',
              },
              'params': {
                  'skip_download': True,  # Requires rtmpdump
-            }
+            },
          },
          {
              'url': 'http://www.gdcvault.com/play/1015301/Thexder-Meets-Windows-95-or',
@@ -100,6 +97,26 @@ class GDCVaultIE(InfoExtractor):
                  'format': 'mp4-408',
              },
          },
+        {
+            # Kaltura embed, whitespace between quote and embedded URL in iframe's src
+            'url': 'https://www.gdcvault.com/play/1025699',
+            'info_dict': {
+                'id': '0_zagynv0a',
+                'ext': 'mp4',
+                'title': 'Tech Toolbox',
+                'upload_date': '20190408',
+                'uploader_id': 'joe@blazestreaming.com',
+                'timestamp': 1554764629,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # HTML5 video
+            'url': 'http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru',
+            'only_matching': True,
+        },
      ]
  
      def _login(self, webpage_url, display_id):
@@ -117,41 +134,81 @@ def _login(self, webpage_url, display_id):
              'password': password,
          }
  
-        request = sanitized_Request(login_url, urlencode_postdata(login_form))
-        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        request = Request(login_url, urlencode_postdata(login_form))
+        request.headers['Content-Type'] = 'application/x-www-form-urlencoded'
          self._download_webpage(request, display_id, 'Logging in')
-        webpage = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page')
+        start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page')
          self._download_webpage(logout_url, display_id, 'Logging out')
  
-        return webpage
+        return start_page
  
      def _real_extract(self, url):
-        video_id, name = re.match(self._VALID_URL, url).groups()
+        video_id, name = self._match_valid_url(url).groups()
          display_id = name or video_id
  
-        webpage = self._download_webpage(url, display_id)
-
-        title = self._html_search_regex(
-            r'<td><strong>Session Name:?</strong></td>\s*<td>(.*?)</td>',
-            webpage, 'title')
-
-        PLAYER_REGEX = r'<iframe src=\"(?P<manifest_url>.*?)\".*?</iframe>'
-        manifest_url = self._html_search_regex(
-            PLAYER_REGEX, webpage, 'manifest_url')
-
-        partner_id = self._search_regex(
-            r'/p(?:artner_id)?/(\d+)', manifest_url, 'partner id',
-            default='1670711')
+        webpage_url = 'http://www.gdcvault.com/play/' + video_id
+        start_page = self._download_webpage(webpage_url, display_id)
+
+        direct_url = self._search_regex(
+            r's1\.addVariable\("file",\s*encodeURIComponent\("(/[^"]+)"\)\);',
+            start_page, 'url', default=None)
+        if direct_url:
+            title = self._html_search_regex(
+                r'<td><strong>Session Name:?</strong></td>\s*<td>(.*?)</td>',
+                start_page, 'title')
+            video_url = 'http://www.gdcvault.com' + direct_url
+            # resolve the url so that we can detect the correct extension
+            video_url = self._request_webpage(
+                HEADRequest(video_url), video_id).url
+
+            return {
+                'id': video_id,
+                'display_id': display_id,
+                'url': video_url,
+                'title': title,
+            }
  
-        kaltura_id = self._search_regex(
-            r'entry_id=(?P<id>(?:[^&])+)', manifest_url,
-            'kaltura id', group='id')
+        embed_url = KalturaIE._extract_url(start_page)
+        if embed_url:
+            embed_url = smuggle_url(embed_url, {'source_url': url})
+            ie_key = 'Kaltura'
+        else:
+            PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/(?:gdc-)?player.*?\.html.*?".*?</iframe>'
+
+            xml_root = self._html_search_regex(
+                PLAYER_REGEX, start_page, 'xml root', default=None)
+            if xml_root is None:
+                # Probably need to authenticate
+                login_res = self._login(webpage_url, display_id)
+                if login_res is None:
+                    self.report_warning('Could not login.')
+                else:
+                    start_page = login_res
+                    # Grab the url from the authenticated page
+                    xml_root = self._html_search_regex(
+                        PLAYER_REGEX, start_page, 'xml root')
+
+            xml_name = self._html_search_regex(
+                r'<iframe src=".*?\?xml(?:=|URL=xml/)(.+?\.xml).*?".*?</iframe>',
+                start_page, 'xml filename', default=None)
+            if not xml_name:
+                info = self._parse_html5_media_entries(url, start_page, video_id)[0]
+                info.update({
+                    'title': remove_start(self._search_regex(
+                        r'>Session Name:\s*<.*?>\s*<td>(.+?)</td>', start_page,
+                        'title', default=None) or self._og_search_title(
+                        start_page, default=None), 'GDC Vault - '),
+                    'id': video_id,
+                    'display_id': display_id,
+                })
+                return info
+            embed_url = f'{xml_root}/xml/{xml_name}'
+            ie_key = 'DigitallySpeaking'
  
          return {
              '_type': 'url_transparent',
-            'url': 'kaltura:%s:%s' % (partner_id, kaltura_id),
-            'ie_key': KalturaIE.ie_key(),
              'id': video_id,
              'display_id': display_id,
-            'title': title,
+            'url': embed_url,
+            'ie_key': ie_key,
          }