[ie/matchtv] Fix extractor (#10190)

[yt-dlp.git] / yt_dlp / extractor / mediaklikk.py
diff --git a/yt_dlp/extractor/mediaklikk.py b/yt_dlp/extractor/mediaklikk.py

index b9b6d739f5651d114e98c742174a3d8b4112a038..bd1a27fccca5ecd7f88af19a89585ac17e4dcf12 100644 (file)
--- a/yt_dlp/extractor/mediaklikk.py
+++ b/yt_dlp/extractor/mediaklikk.py
@@ -1,70 +1,117 @@
-# coding: utf-8
-from __future__ import unicode_literals
+import urllib.parse
  
-from ..utils import (
-    unified_strdate
-)
  from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse_unquote,
-    compat_str
+from ..utils import (
+    ExtractorError,
+    traverse_obj,
+    unified_strdate,
+    url_or_none,
  )
  
  
  class MediaKlikkIE(InfoExtractor):
-    _VALID_URL = r'''(?x)^https?:\/\/(?:www\.)?
-                        (?:mediaklikk|m4sport|hirado|petofilive)\.hu\/.*?videok?\/
+    _VALID_URL = r'''(?x)https?://(?:www\.)?
+                        (?:mediaklikk|m4sport|hirado|petofilive)\.hu/.*?(?:videok?|cikk)/
                          (?:(?P<year>[0-9]{4})/(?P<month>[0-9]{1,2})/(?P<day>[0-9]{1,2})/)?
                          (?P<id>[^/#?_]+)'''
  
      _TESTS = [{
-        # mediaklikk. date in html.
+        # (old) mediaklikk. date in html.
          'url': 'https://mediaklikk.hu/video/hazajaro-delnyugat-bacska-a-duna-menten-palankatol-doroszloig/',
          'info_dict': {
              'id': '4754129',
              'title': 'Hazajáró, DÉLNYUGAT-BÁCSKA – A Duna mentén Palánkától Doroszlóig',
              'ext': 'mp4',
              'upload_date': '20210901',
-            'thumbnail': 'http://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg'
-        }
+            'thumbnail': 'http://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg',
+        },
+        'skip': 'Webpage redirects to 404 page',
      }, {
-        # m4sport
+        # mediaklikk. date in html.
+        'url': 'https://mediaklikk.hu/video/hazajaro-fabova-hegyseg-kishont-koronaja/',
+        'info_dict': {
+            'id': '6696133',
+            'title': 'Hazajáró, Fabova-hegység - Kishont koronája',
+            'display_id': 'hazajaro-fabova-hegyseg-kishont-koronaja',
+            'ext': 'mp4',
+            'upload_date': '20230903',
+            'thumbnail': 'https://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg',
+        },
+    }, {
+        # (old) m4sport
          'url': 'https://m4sport.hu/video/2021/08/30/gyemant-liga-parizs/',
          'info_dict': {
              'id': '4754999',
              'title': 'Gyémánt Liga, Párizs',
              'ext': 'mp4',
              'upload_date': '20210830',
-            'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/08/vlcsnap-2021-08-30-18h21m20s10-1024x576.jpg'
-        }
+            'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/08/vlcsnap-2021-08-30-18h21m20s10-1024x576.jpg',
+        },
+        'skip': 'Webpage redirects to 404 page',
+    }, {
+        # m4sport
+        'url': 'https://m4sport.hu/sportkozvetitesek/video/2023/09/08/atletika-gyemant-liga-brusszel/',
+        'info_dict': {
+            'id': '6711136',
+            'title': 'Atlétika – Gyémánt Liga, Brüsszel',
+            'display_id': 'atletika-gyemant-liga-brusszel',
+            'ext': 'mp4',
+            'upload_date': '20230908',
+            'thumbnail': 'https://m4sport.hu/wp-content/uploads/sites/4/2023/09/vlcsnap-2023-09-08-22h43m18s691.jpg',
+        },
      }, {
          # m4sport with *video/ url and no date
          'url': 'https://m4sport.hu/bl-video/real-madrid-chelsea-1-1/',
          'info_dict': {
              'id': '4492099',
              'title': 'Real Madrid - Chelsea 1-1',
+            'display_id': 'real-madrid-chelsea-1-1',
              'ext': 'mp4',
-            'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/04/Sequence-01.Still001-1024x576.png'
-        }
+            'thumbnail': 'https://m4sport.hu/wp-content/uploads/sites/4/2021/04/Sequence-01.Still001-1024x576.png',
+        },
      }, {
-        # hirado
+        # (old) hirado
          'url': 'https://hirado.hu/videok/felteteleket-szabott-a-fovaros/',
          'info_dict': {
              'id': '4760120',
              'title': 'Feltételeket szabott a főváros',
              'ext': 'mp4',
-            'thumbnail': 'http://hirado.hu/wp-content/uploads/sites/4/2021/09/vlcsnap-2021-09-01-20h20m37s165.jpg'
-        }
+            'thumbnail': 'http://hirado.hu/wp-content/uploads/sites/4/2021/09/vlcsnap-2021-09-01-20h20m37s165.jpg',
+        },
+        'skip': 'Webpage redirects to video list page',
      }, {
-        # petofilive
+        # hirado
+        'url': 'https://hirado.hu/belfold/video/2023/09/11/marad-az-eves-elszamolas-a-napelemekre-beruhazo-csaladoknal',
+        'info_dict': {
+            'id': '6716068',
+            'title': 'Marad az éves elszámolás a napelemekre beruházó családoknál',
+            'display_id': 'marad-az-eves-elszamolas-a-napelemekre-beruhazo-csaladoknal',
+            'ext': 'mp4',
+            'upload_date': '20230911',
+            'thumbnail': 'https://hirado.hu/wp-content/uploads/sites/4/2023/09/vlcsnap-2023-09-11-09h16m09s882.jpg',
+        },
+    }, {
+        # (old) petofilive
          'url': 'https://petofilive.hu/video/2021/06/07/tha-shudras-az-akusztikban/',
          'info_dict': {
              'id': '4571948',
              'title': 'Tha Shudras az Akusztikban',
              'ext': 'mp4',
              'upload_date': '20210607',
-            'thumbnail': 'http://petofilive.hu/wp-content/uploads/sites/4/2021/06/vlcsnap-2021-06-07-22h14m23s915-1024x576.jpg'
-        }
+            'thumbnail': 'http://petofilive.hu/wp-content/uploads/sites/4/2021/06/vlcsnap-2021-06-07-22h14m23s915-1024x576.jpg',
+        },
+        'skip': 'Webpage redirects to empty page',
+    }, {
+        # petofilive
+        'url': 'https://petofilive.hu/video/2023/09/09/futball-fesztival-a-margitszigeten/',
+        'info_dict': {
+            'id': '6713233',
+            'title': 'Futball Fesztivál a Margitszigeten',
+            'display_id': 'futball-fesztival-a-margitszigeten',
+            'ext': 'mp4',
+            'upload_date': '20230909',
+            'thumbnail': 'https://petofilive.hu/wp-content/uploads/sites/4/2023/09/Clipboard11-2.jpg',
+        },
      }]
  
      def _real_extract(self, url):
@@ -74,25 +121,28 @@ def _real_extract(self, url):
  
          player_data_str = self._html_search_regex(
              r'mtva_player_manager\.player\(document.getElementById\(.*\),\s?(\{.*\}).*\);', webpage, 'player data')
-        player_data = self._parse_json(player_data_str, display_id, compat_urllib_parse_unquote)
-        video_id = compat_str(player_data['contentId'])
+        player_data = self._parse_json(player_data_str, display_id, urllib.parse.unquote)
+        video_id = str(player_data['contentId'])
          title = player_data.get('title') or self._og_search_title(webpage, fatal=False) or \
              self._html_search_regex(r'<h\d+\b[^>]+\bclass="article_title">([^<]+)<', webpage, 'title')
  
          upload_date = unified_strdate(
-            '%s-%s-%s' % (mobj.group('year'), mobj.group('month'), mobj.group('day')))
+            '{}-{}-{}'.format(mobj.group('year'), mobj.group('month'), mobj.group('day')))
          if not upload_date:
              upload_date = unified_strdate(self._html_search_regex(
                  r'<p+\b[^>]+\bclass="article_date">([^<]+)<', webpage, 'upload date', default=None))
  
          player_data['video'] = player_data.pop('token')
          player_page = self._download_webpage('https://player.mediaklikk.hu/playernew/player.php', video_id, query=player_data)
-        playlist_url = self._proto_relative_url(compat_urllib_parse_unquote(
-            self._html_search_regex(r'\"file\":\s*\"(\\?/\\?/.*playlist\.m3u8)\"', player_page, 'playlist_url')).replace('\\/', '/'))
+        player_json = self._search_json(
+            r'\bpl\.setup\s*\(', player_page, 'player json', video_id, end_pattern=r'\);')
+        playlist_url = traverse_obj(
+            player_json, ('playlist', lambda _, v: v['type'] == 'hls', 'file', {url_or_none}), get_all=False)
+        if not playlist_url:
+            raise ExtractorError('Unable to extract playlist url')
  
          formats = self._extract_wowza_formats(
              playlist_url, video_id, skip_protocols=['f4m', 'smil', 'dash'])
-        self._sort_formats(formats)
  
          return {
              'id': video_id,
@@ -100,5 +150,5 @@ def _real_extract(self, url):
              'display_id': display_id,
              'formats': formats,
              'upload_date': upload_date,
-            'thumbnail': player_data.get('bgImage') or self._og_search_thumbnail(webpage)
+            'thumbnail': player_data.get('bgImage') or self._og_search_thumbnail(webpage),
          }